正则匹配HTML嵌套元素(一)

正则匹配HTML嵌套元素(一)

在 JS、Java 这类不支持正则平衡组等高级特性的语言中,如何用正则匹配 HTML 嵌套元素?
java版实现源码

第一步

查找目标元素开始位置,例根据id查找

// Value of the id attribute to look for.
var id = "home";
// Opening-tag pattern: capture group 1 is the tag name (the characters after "<"
// up to the first space or ">"); the tag must contain id="<id>" in double quotes.
var regStr = "< *([^ >]*)[^>]*?\\bid *= *\""+id+"\"[^>]*>";
// Flags: i = case-insensitive, m = multiline, g = global (exec() tracks lastIndex).
var match = new RegExp(regStr, "img");
// First matching opening tag, or null. `html` is not defined in this snippet;
// presumably it holds the markup being searched.
var group = match.exec(html);

第二步

根据目标元素的标签,向下查找相同标签的打开和结束标签,并计数
如果是打开标签,计数加1,否则计数减1。当计数为0时结束查找

// Matches every opening or closing tag whose name starts with `tag`
// (an optional "/" is allowed after "<"). `tag` and `html` are not defined in
// this snippet; presumably `tag` is the tag name captured in step one.
// NOTE(review): nothing anchors the end of the name, so tag "p" also matches
// <pre> — confirm whether that is acceptable.
var regStrAll = "< */? *"+tag+"[^>]*>";
var matchAll = new RegExp(regStrAll, "img");

// Matches only the closing form </tag>; used to classify each hit of matchAll.
var regStrClose = "< */ *"+tag+" *>";
var matchClose = new RegExp(regStrClose, "im");

// Number of currently unclosed tags; the loop runs until it drops to 0.
var openCount = 1;
// Offset just past the most recent closing tag seen; stays 0 if none is found.
var lastCloseIndex = 0;
while(openCount > 0){
	// With the "g" flag each exec() resumes after the previous match.
	var groupAll=matchAll.exec(html);
	if(groupAll==null){
		// No further tags of this name: stop even though the count is not 0.
		break;
	}else{
		if(matchClose.test(groupAll[0])){
			// Closing tag: one level less, remember where it ends.
			openCount--;
			lastCloseIndex = groupAll.index+groupAll[0].length;
		}else{
			// Anything that is not a closing tag is counted as a nested opening tag.
			openCount++;
		}
	}
}

这样就可以完成嵌套元素的匹配了
接下来我们对代码进行封装扩展,以实现getElementById、getElementByTag、getElementByClass三个常用函数

/**
 * Looks up an element by its id attribute.
 *
 * @param {string} html - markup to search
 * @param {string} id - value the id attribute must have
 * @returns {*} the result of getElementByAttr for the "id" attribute
 */
function getElementById(html, id){
	var attrName = "id";
	var found = getElementByAttr(html, attrName, id);
	return found;
}
/**
 * Looks up the first element with the given tag name.
 *
 * @param {string} html - markup to search
 * @param {string} tag - tag name (inserted into the pattern as-is)
 * @returns {*} the result of queryElement for the opening-tag pattern
 */
function getElementByTag(html, tag){
	// The negative lookahead requires the name to end right after `tag`, so
	// "p" no longer matches <pre> and "a" no longer matches <abbr>.
	var regStr = "< *("+tag+")(?![\\w:-])[^>]*>";
	return queryElement(regStr, html);
}
/**
 * Looks up an element by the value of its class attribute.
 *
 * @param {string} html - markup to search
 * @param {string} className - value the class attribute must have
 * @returns {*} the result of getElementByAttr for the "class" attribute
 */
function getElementByClass(html, className){
	var attrName = "class";
	var found = getElementByAttr(html, attrName, className);
	return found;
}
/**
 * Looks up the first element carrying the attribute key="value" (double quotes).
 *
 * @param {string} html - markup to search
 * @param {string} key - attribute name (inserted into the pattern as-is)
 * @param {string} value - attribute value (inserted into the pattern as-is)
 * @returns {*} the result of queryElement for the opening-tag pattern
 */
function getElementByAttr(html, key, value){
	// Group 1 (tag name) stops at any whitespace, "/" or ">", so a tab or
	// newline after the name is not captured as part of it.
	// The attribute must be preceded by whitespace, so key "id" does not
	// match inside "data-id".
	var regStr = "< *([^\\s>/]*)[^>]*?\\s"+key+" *= *\""+value+"\"[^>]*>";
	return queryElement(regStr, html);
}
/**
 * Returns the markup of the first element whose opening tag matches regStr.
 *
 * @param {string} regStr - opening-tag pattern; capture group 1 is the tag name
 * @param {string} html - markup to search
 * @returns {?string} text from the opening tag through the offset reported by
 *   queryCloseTag, or null when no opening tag matches
 */
function queryElement(regStr, html){
	var opening = new RegExp(regStr, "img").exec(html);
	if(opening === null){
		return null;
	}
	var from = opening.index;
	var bodyStart = from + opening[0].length;
	// queryCloseTag works on the text after the opening tag and returns an
	// offset relative to it.
	var remainder = html.substring(bodyStart);
	var bodyLength = queryCloseTag(opening[1], remainder);
	return html.substring(from, bodyStart + bodyLength);
}
/**
 * Finds where the element that is already open ends.
 *
 * Scans html for opening and closing tags named `tag`, starting with a
 * nesting count of 1, and stops when the count reaches 0.
 *
 * @param {string} tag - tag name (inserted into the pattern as-is)
 * @param {string} html - markup following the element's opening tag
 * @returns {number} offset in html just past the last closing tag consumed;
 *   0 when no closing tag was found
 */
function queryCloseTag(tag, html){
	// The lookahead ends the tag name, so "p" does not count <pre> or </pre>.
	var matchAll = new RegExp("< */? *"+tag+"(?![\\w:-])[^>]*>", "img");
	// Every hit of matchAll is a tag named `tag`; it is a closing tag exactly
	// when a "/" follows the "<".
	var isClosing = /^< *\//;

	var openCount = 1;
	var lastCloseIndex = 0;
	var groupAll;
	while(openCount > 0 && (groupAll = matchAll.exec(html)) !== null){
		if(isClosing.test(groupAll[0])){
			openCount--;
			lastCloseIndex = groupAll.index+groupAll[0].length;
		}else{
			openCount++;
		}
	}
	return lastCloseIndex;
}

特殊标签处理

1, 自结束标签如:

<br/>, <input type="text"/>

/**
 * Returns the markup of the first element whose opening tag matches regStr.
 * An opening tag that ends in "/>" is treated as the whole element.
 *
 * @param {string} regStr - opening-tag pattern; capture group 1 is the tag name
 * @param {string} html - markup to search
 * @returns {?string} the element's markup, or null when no opening tag matches
 */
function queryElement(regStr, html){
	var opening = new RegExp(regStr, "img").exec(html);
	if(opening === null){
		return null;
	}
	var openTag = opening[0];
	var bodyStart = opening.index + openTag.length;
	// Self-closed tags have no body, so the closing-tag search is skipped.
	var isSelfClosed = /\/ *>$/.test(openTag);
	var bodyLength = isSelfClosed ? 0 : queryCloseTag(opening[1], html.substring(bodyStart));
	return html.substring(opening.index, bodyStart + bodyLength);
}

2, 省略结束标签如:

<br>, <input type="text">

/**
 * Finds where the element that is already open ends.
 *
 * Scans html for tags named `tag`, starting with a nesting count of 1, and
 * stops when the count reaches 0. For input/br/img, which are normally written
 * without a closing tag, the search gives up as soon as another opening tag of
 * the same name is seen.
 *
 * @param {string} tag - tag name (inserted into the patterns as-is)
 * @param {string} html - markup following the element's opening tag
 * @returns {number} offset in html just past the last closing tag consumed;
 *   0 when no closing tag was found or the void-tag rule applied
 */
function queryCloseTag(tag, html){
	// The lookahead ends the tag name, so "p" does not count <pre> or </pre>.
	var matchAll = new RegExp("< */? *"+tag+"(?![\\w:-])[^>]*>", "img");
	// A hit is a closing tag exactly when a "/" follows the "<".
	var isClosing = /^< *\//;
	// Same self-closing test that queryElement applies to the opening tag.
	var isSelfClosed = /\/ *>$/;
	// Loop-invariant, so computed once instead of on every opening tag.
	var isVoidTag = new RegExp("\\b"+tag+"\\b", "i").test("input br img");

	var openCount = 1;
	var lastCloseIndex = 0;
	var groupAll;
	while(openCount > 0 && (groupAll = matchAll.exec(html)) !== null){
		var text = groupAll[0];
		if(isClosing.test(text)){
			openCount--;
			lastCloseIndex = groupAll.index+text.length;
		}else if(isVoidTag){
			return 0;
		}else if(!isSelfClosed.test(text)){
			// A nested <tag/> opens and closes at once, so only real openings count.
			openCount++;
		}
	}
	return lastCloseIndex;
}

getElementByClass改进

使用时发现getElementByClass经常需要同时用多个class来查找元素,如getElementByClass(document.body.innerHTML, ".page.in")

/**
 * Looks up the first element whose class attribute contains every class in
 * a dot-separated list such as ".page.in".
 *
 * @param {string} html - markup to search
 * @param {string} classNames - class names separated by "." (empty parts are ignored)
 * @returns {*} the result of queryElement
 */
function getElementByClass(html, classNames){
	var classArr = classNames.split(".");
	// One anchored lookahead per class. Each name must be a whole
	// whitespace-delimited token, so "page" does not match "page-header"
	// or "mypage".
	var classReg = "^";
	for(var i=0; i<classArr.length; i++){
		var className = classArr[i];
		if(className.length>0){
			classReg+="(?=(?:[\\s\\S]*?\\s)?"+className+"(?:\\s|$))";
		}
	}
	var option = {
		index: 2,        // capture group holding the class attribute value
		regStr: classReg // extra pattern that this group must satisfy
	};
	// Group 1 (tag name) stops at whitespace, "/" or ">"; the attribute must
	// be preceded by whitespace so that "data-class" is not mistaken for it.
	var regStr = "< *([^\\s>/]*)[^>]*?\\sclass *= *\"([^\"]*)\"[^>]*>";
	return queryElement(regStr, html, option);
}
/**
 * Returns the markup of the first element whose opening tag matches regStr
 * and, when option is given, whose capture group option.index also matches
 * option.regStr.
 *
 * @param {string} regStr - opening-tag pattern; capture group 1 is the tag name
 * @param {string} html - markup to search
 * @param {{index: number, regStr: string}} [option] - extra filter on one capture group
 * @returns {?string} the element's markup, or null when nothing qualifies
 */
function queryElement(regStr, html, option){
	var finder = new RegExp(regStr, "img");
	var hit = finder.exec(html);
	if(option != null){
		// Skip candidates until one passes the extra filter or none are left.
		while(hit !== null && !new RegExp(option.regStr, "im").test(hit[option.index])){
			hit = finder.exec(html);
		}
	}
	if(hit === null){
		return null;
	}
	var openTag = hit[0];
	var bodyStart = hit.index + openTag.length;
	// A tag ending in "/>" is the whole element; otherwise look for its end.
	var isSelfClosed = /\/ *>$/.test(openTag);
	var bodyLength = isSelfClosed ? 0 : queryCloseTag(hit[1], html.substring(bodyStart));
	return html.substring(hit.index, bodyStart + bodyLength);
}

改进queryElement

很多时候我们需要查找的结果是个数组,而不只是返回一个元素

/**
 * Returns the markup of the element(s) whose opening tag matches regStr.
 *
 * @param {string} regStr - opening-tag pattern; capture group 1 is the tag name
 * @param {string} html - markup to search
 * @param {Object} [option] - may be omitted
 * @param {boolean} [option.multiElement] - collect every match into an array
 * @param {string} [option.regStr] - extra pattern the group option.index must match
 * @param {number} [option.index] - capture group checked against option.regStr
 * @returns {?string|string[]} with multiElement: an array (possibly empty);
 *   otherwise the first match, or null when nothing qualifies
 */
function queryElement(regStr, html, option){
	// Callers that pass only (regStr, html) must not crash on option.multiElement.
	var opts = option || {};
	var match = new RegExp(regStr, "img");
	// Built once; it has no "g" flag, so test() keeps no state between calls.
	var filter = opts.regStr != null ? new RegExp(opts.regStr, "im") : null;
	var result = opts.multiElement ? [] : null;
	var group;
	while((group = match.exec(html)) !== null){
		if(filter !== null && !filter.test(group[opts.index])){
			continue;
		}
		var searchStart = group.index+group[0].length;
		var closeIndex = 0;
		// A tag ending in "/>" is the whole element; otherwise look for its end.
		if(!/\/ *>$/.test(group[0])){
			closeIndex = queryCloseTag(group[1], html.substring(searchStart));
		}
		var targetHtml = html.substring(group.index, searchStart+closeIndex);
		if(result === null){
			return targetHtml;
		}
		result.push(targetHtml);
	}
	return result;
}

最终代码

/**
 * Looks up a single element by its id attribute.
 *
 * @param {string} html - markup to search
 * @param {string} id - value the id attribute must have
 * @returns {*} the result of getElementByAttr in single-element mode
 */
function getElementById(html, id){
	var wantAll = false;
	var found = getElementByAttr(html, "id", id, wantAll);
	return found;
}
/**
 * Looks up every element with the given tag name.
 *
 * @param {string} html - markup to search
 * @param {string} tag - tag name (inserted into the pattern as-is)
 * @returns {*} the result of queryElement in multi-element mode
 */
function getElementsByTag(html, tag){
	// The negative lookahead requires the name to end right after `tag`, so
	// "p" no longer matches <pre> and "a" no longer matches <abbr>.
	var regStr = "< *("+tag+")(?![\\w:-])[^>]*>";
	return queryElement(regStr, html, {multiElement: true});
}
/**
 * Looks up every element whose class attribute contains all classes in a
 * dot-separated list such as ".page.in".
 *
 * @param {string} html - markup to search
 * @param {string} classNames - class names separated by "." (empty parts are ignored)
 * @returns {*} the result of queryElement in multi-element mode
 */
function getElementsByClass(html, classNames){
	var classArr = classNames.split(".");
	// One anchored lookahead per class. Each name must be a whole
	// whitespace-delimited token, so "page" does not match "page-header"
	// or "mypage".
	var classReg = "^";
	for(var i=0; i<classArr.length; i++){
		var className = classArr[i];
		if(className.length>0){
			classReg+="(?=(?:[\\s\\S]*?\\s)?"+className+"(?:\\s|$))";
		}
	}
	var option = {
		multiElement: true,
		index: 2,        // capture group holding the class attribute value
		regStr: classReg // extra pattern that this group must satisfy
	};
	// Group 1 (tag name) stops at whitespace, "/" or ">"; the attribute must
	// be preceded by whitespace so that "data-class" is not mistaken for it.
	var regStr = "< *([^\\s>/]*)[^>]*?\\sclass *= *\"([^\"]*)\"[^>]*>";
	return queryElement(regStr, html, option);
}
/**
 * Looks up element(s) carrying the attribute key="value" (double quotes).
 *
 * @param {string} html - markup to search
 * @param {string} key - attribute name (inserted into the pattern as-is)
 * @param {string} value - attribute value (inserted into the pattern as-is)
 * @param {boolean} multiElement - passed to queryElement as option.multiElement
 * @returns {*} the result of queryElement
 */
function getElementByAttr(html, key, value, multiElement){
	// [^>]*? keeps the search inside one tag. The previous ".*?" could run
	// past ">" and report an ancestor (e.g. <div> for <div><span id="x">).
	// Group 1 (tag name) stops at whitespace, "/" or ">"; the attribute must
	// be preceded by whitespace so that key "id" does not match "data-id".
	var regStr = "< *([^\\s>/]*)[^>]*?\\s"+key+" *= *\""+value+"\"[^>]*>";
	return queryElement(regStr, html, {multiElement: multiElement});
}
/**
 * Returns the markup of the element(s) whose opening tag matches regStr.
 *
 * @param {string} regStr - opening-tag pattern; capture group 1 is the tag name
 * @param {string} html - markup to search
 * @param {Object} [option] - may be omitted
 * @param {boolean} [option.multiElement] - collect every match into an array
 * @param {string} [option.regStr] - extra pattern the group option.index must match
 * @param {number} [option.index] - capture group checked against option.regStr
 * @returns {?string|string[]} with multiElement: an array (possibly empty);
 *   otherwise the first match, or null when nothing qualifies
 */
function queryElement(regStr, html, option){
	// Calling with only (regStr, html) must not crash on option.multiElement.
	var opts = option || {};
	var match = new RegExp(regStr, "img");
	// Built once; it has no "g" flag, so test() keeps no state between calls.
	var filter = opts.regStr != null ? new RegExp(opts.regStr, "im") : null;
	var result = opts.multiElement ? [] : null;
	var group;
	while((group = match.exec(html)) !== null){
		if(filter !== null && !filter.test(group[opts.index])){
			continue;
		}
		var searchStart = group.index+group[0].length;
		var closeIndex = 0;
		// A tag ending in "/>" is the whole element; otherwise look for its end.
		if(!/\/ *>$/.test(group[0])){
			closeIndex = queryCloseTag(group[1], html.substring(searchStart));
		}
		var targetHtml = html.substring(group.index, searchStart+closeIndex);
		if(result === null){
			return targetHtml;
		}
		result.push(targetHtml);
	}
	return result;
}
/**
 * Finds where the element that is already open ends.
 *
 * Scans html for tags named `tag`, starting with a nesting count of 1, and
 * stops when the count reaches 0. For input/br/img, which are normally written
 * without a closing tag, the search gives up as soon as another opening tag of
 * the same name is seen.
 *
 * @param {string} tag - tag name (inserted into the patterns as-is)
 * @param {string} html - markup following the element's opening tag
 * @returns {number} offset in html just past the last closing tag consumed;
 *   0 when no closing tag was found or the void-tag rule applied
 */
function queryCloseTag(tag, html){
	// The lookahead ends the tag name, so "p" does not count <pre> or </pre>.
	var matchAll = new RegExp("< */? *"+tag+"(?![\\w:-])[^>]*>", "img");
	// A hit is a closing tag exactly when a "/" follows the "<".
	var isClosing = /^< *\//;
	// Same self-closing test that queryElement applies to the opening tag.
	var isSelfClosed = /\/ *>$/;
	// The list must say "img" (the tag name), not "image". Loop-invariant,
	// so computed once.
	var isVoidTag = new RegExp("\\b"+tag+"\\b", "i").test("input br img");

	var openCount = 1;
	var lastCloseIndex = 0;
	var groupAll;
	while(openCount > 0 && (groupAll = matchAll.exec(html)) !== null){
		var text = groupAll[0];
		if(isClosing.test(text)){
			openCount--;
			lastCloseIndex = groupAll.index+text.length;
		}else if(isVoidTag){
			return 0;
		}else if(!isSelfClosed.test(text)){
			// A nested <tag/> opens and closes at once, so only real openings count.
			openCount++;
		}
	}
	return lastCloseIndex;
}

下一篇 实现#(id)空格(层级).(类)混合查询