正则匹配HTML嵌套元素(一)
正则匹配HTML嵌套元素(一)
在 JavaScript、Java 这类不支持正则平衡组等高级特性的语言中,如何用正则匹配 HTML 嵌套元素?
java版实现源码
第一步
查找目标元素的开始位置,例如根据 id 查找:
// Step 1: find the opening tag that carries id="<id>".
// Capture group 1 is the text right after "<" up to the first space or ">",
// i.e. the tag name of the matched element.
var id = "home";
var regStr = ["< *([^ >]*)[^>]*?\\bid *= *\"", id, "\"[^>]*>"].join("");
var match = new RegExp(regStr, "img");
// `html` (the text being searched) is not defined inside this snippet.
var group = match.exec(html);
第二步
根据目标元素的标签,向下查找相同标签的打开和结束标签,并计数
如果是打开标签,计数加1,否则计数减1。当计数为0时结束查找
// Step 2: walk every opening/closing tag with the same name as the target
// element and keep a nesting counter until it drops back to zero.
// Relies on `group`, the step-1 match, being non-null.
var tag = group[1];
// (?=[\s/>]) forces the tag name to end here, so "p" does not match "<pre>".
var regStrAll = "< */? *"+tag+"(?=[\\s/>])[^>]*>";
var matchAll = new RegExp(regStrAll, "img");
var regStrClose = "< */ *"+tag+" *>";
var matchClose = new RegExp(regStrClose, "im");
// Resume scanning right after the target's own opening tag; that tag is
// already accounted for by openCount = 1.
matchAll.lastIndex = group.index+group[0].length;
var openCount = 1;
var lastCloseIndex = 0;
while(openCount > 0){
    var groupAll=matchAll.exec(html);
    if(groupAll==null){
        // No more tags with this name: stop even if the nesting is unbalanced.
        break;
    }
    if(matchClose.test(groupAll[0])){
        openCount--;
        // Offset (within html) just past this closing tag.
        lastCloseIndex = groupAll.index+groupAll[0].length;
    }else{
        openCount++;
    }
}
这样就可以完成嵌套元素的匹配了
接下来我们对代码进行封装扩展,以实现getElementById、getElementByTag、getElementByClass三个常用函数
/**
 * Look up an element by its id attribute.
 * Delegates to getElementByAttr(html, "id", id) and returns its result unchanged.
 * @param {string} html - HTML text to search.
 * @param {string} id - id value to look for.
 */
function getElementById(html, id){
    var attrName = "id";
    var found = getElementByAttr(html, attrName, id);
    return found;
}
/**
 * Look up an element by tag name.
 * Builds the opening-tag pattern (capture group 1 = tag name) and hands it to
 * queryElement, whose result is returned unchanged.
 * @param {string} html - HTML text to search.
 * @param {string} tag - Tag name; inserted into the pattern as-is.
 */
function getElementByTag(html, tag){
    // (?=[\s/>]) requires the tag name to end right after `tag`, so that
    // "p" no longer matches "<pre>" and "a" no longer matches "<article>".
    var regStr = "< *("+tag+")(?=[\\s/>])[^>]*>";
    return queryElement(regStr, html);
}
/**
 * Look up an element by its class attribute.
 * Delegates to getElementByAttr(html, "class", className) and returns its
 * result unchanged.
 * @param {string} html - HTML text to search.
 * @param {string} className - Value to look for in the class attribute.
 */
function getElementByClass(html, className){
    var attrName = "class";
    var found = getElementByAttr(html, attrName, className);
    return found;
}
/**
 * Look up an element that has the attribute key="value" (double-quoted).
 * Builds the opening-tag pattern (capture group 1 = tag name) and hands it to
 * queryElement, whose result is returned unchanged.
 * @param {string} html - HTML text to search.
 * @param {string} key - Attribute name, matched literally.
 * @param {string} value - Attribute value, matched literally.
 */
function getElementByAttr(html, key, value){
    // Escape regex metacharacters so e.g. an id of "a.b" only matches "a.b".
    var escapeReg = function(text){
        return String(text).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    };
    // - [^\s/>]+ keeps tabs/newlines and "/" out of the captured tag name.
    // - The attribute name must be preceded by whitespace, so key "id" does
    //   not match inside "data-id".
    var regStr = "<\\s*([^\\s/>]+)(?:\\s[^>]*?)?\\s"+escapeReg(key)
        +"\\s*=\\s*\""+escapeReg(value)+"\"[^>]*>";
    return queryElement(regStr, html);
}
/**
 * Return the HTML of the first element whose opening tag matches regStr.
 * @param {string} regStr - Pattern for the opening tag; capture group 1 must
 *   be the tag name, which is passed on to queryCloseTag.
 * @param {string} html - HTML text to search.
 * @returns {?string} Text from the start of the opening tag up to the offset
 *   reported by queryCloseTag, or null when regStr does not match.
 */
function queryElement(regStr, html){
    var opener = new RegExp(regStr, "img").exec(html);
    if(opener === null){
        return null;
    }
    var from = opener.index;
    var bodyStart = from + opener[0].length;
    // queryCloseTag works on the text after the opening tag and answers with
    // an offset relative to that text.
    var rest = html.substring(bodyStart);
    var bodyLength = queryCloseTag(opener[1], rest);
    return html.substring(from, bodyStart + bodyLength);
}
/**
 * Find where the element that was opened just before `html` ends.
 * Counts opening (+1) and closing (-1) tags named `tag`, starting at 1 for
 * the already-consumed opening tag.
 * @param {string} tag - Tag name; inserted into the patterns as-is.
 * @param {string} html - Text that follows the element's opening tag.
 * @returns {number} Offset in `html` just past the last closing tag seen
 *   before the count reached zero or the tags ran out; 0 if none was seen.
 */
function queryCloseTag(tag, html){
    // (?=[\s/>]) requires the tag name to end right after `tag`; without it
    // "p" also counted "<pre>" and "</pre>" as openings of "<p>".
    var regStrAll = "< */? *"+tag+"(?=[\\s/>])[^>]*>";
    var matchAll = new RegExp(regStrAll, "img");
    var regStrClose = "< */ *"+tag+" *>";
    // No "g" flag: test() must stay stateless across iterations.
    var matchClose = new RegExp(regStrClose, "im");
    var openCount = 1;
    var lastCloseIndex = 0;
    while(openCount > 0){
        var groupAll = matchAll.exec(html);
        if(groupAll == null){
            break;
        }
        if(matchClose.test(groupAll[0])){
            openCount--;
            lastCloseIndex = groupAll.index+groupAll[0].length;
        }else{
            openCount++;
        }
    }
    return lastCloseIndex;
}
特殊标签处理
1, 自结束标签如:
<br/>, <input type="text"/>
/**
 * Return the HTML of the first element whose opening tag matches regStr,
 * treating a tag that ends in "/>" as a complete element on its own.
 * @param {string} regStr - Pattern for the opening tag; capture group 1 must
 *   be the tag name, which is passed on to queryCloseTag.
 * @param {string} html - HTML text to search.
 * @returns {?string} The element's text, or null when regStr does not match.
 */
function queryElement(regStr, html){
    var opener = new RegExp(regStr, "img").exec(html);
    if(opener === null){
        return null;
    }
    var openTag = opener[0];
    var bodyStart = opener.index + openTag.length;
    // A self-closing tag has no body, so the closing-tag search is skipped.
    var isSelfClosed = /\/ *>$/.test(openTag);
    var bodyLength = isSelfClosed
        ? 0
        : queryCloseTag(opener[1], html.substring(bodyStart));
    return html.substring(opener.index, bodyStart + bodyLength);
}
2, 省略结束标签如:
<br>, <input type="text">
/**
 * Find where the element that was opened just before `html` ends.
 * Void elements (no closing tag) end right at their opening tag; for all
 * other tags, opening (+1) and closing (-1) tags named `tag` are counted,
 * starting at 1 for the already-consumed opening tag.
 * @param {string} tag - Tag name; inserted into the patterns as-is.
 * @param {string} html - Text that follows the element's opening tag.
 * @returns {number} Offset in `html` just past the last closing tag seen
 *   before the count reached zero or the tags ran out; 0 for void elements
 *   or when no closing tag was seen.
 */
function queryCloseTag(tag, html){
    // Decide this once, up front: it does not depend on what follows, and
    // previously it was only reached after another opening tag was found.
    var voidTags = ["area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "source", "track", "wbr"];
    if(voidTags.indexOf(String(tag).toLowerCase()) !== -1){
        return 0;
    }
    // (?=[\s/>]) requires the tag name to end right after `tag`; without it
    // "p" also counted "<pre>" and "</pre>" as openings of "<p>".
    var regStrAll = "< */? *"+tag+"(?=[\\s/>])[^>]*>";
    var matchAll = new RegExp(regStrAll, "img");
    var regStrClose = "< */ *"+tag+" *>";
    // No "g" flag: test() must stay stateless across iterations.
    var matchClose = new RegExp(regStrClose, "im");
    var openCount = 1;
    var lastCloseIndex = 0;
    while(openCount > 0){
        var groupAll = matchAll.exec(html);
        if(groupAll == null){
            break;
        }
        if(matchClose.test(groupAll[0])){
            openCount--;
            lastCloseIndex = groupAll.index+groupAll[0].length;
        }else{
            openCount++;
        }
    }
    return lastCloseIndex;
}
getElementByClass改进
使用时发现 getElementByClass 经常需要同时用多个 class 来查找元素,如 getElementByClass(document.body.innerHTML, ".page.in")
/**
 * Look up an element that carries every class listed in `classNames`.
 * Builds one lookahead per class and passes them to queryElement as a filter
 * on capture group 2 (the class attribute value) of the opening-tag pattern.
 * @param {string} html - HTML text to search.
 * @param {string} classNames - Dot-separated class list, e.g. ".page.in".
 * @returns {*} Whatever queryElement returns for this lookup.
 */
function getElementByClass(html, classNames){
    var classArr = classNames.split(".");
    var classReg = "";
    for(var i=0; i<classArr.length; i++){
        var className = classArr[i];
        if(className.length>0){
            // Match the name literally ...
            var escaped = className.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
            // ... and only as a whole whitespace-delimited token: \b treated
            // "-" as a boundary, so "page" used to match "page-header".
            classReg+="(?=(?:^|[\\s\\S]*?\\s)"+escaped+"(?:\\s|$))";
        }
    }
    var option = {
        index: 2,
        regStr: classReg
    };
    var regStr = "< *([^ |>]*)[^>]*?\\bclass *= *\"([^\"]*)\"[^>]*>";
    return queryElement(regStr, html, option);
}
/**
 * Return the HTML of the first element whose opening tag matches regStr and,
 * when `option` is given, whose capture group option.index also matches
 * option.regStr.
 * @param {string} regStr - Pattern for the opening tag; capture group 1 must
 *   be the tag name, which is passed on to queryCloseTag.
 * @param {string} html - HTML text to search.
 * @param {?{index: number, regStr: string}} [option] - Extra filter applied
 *   to one capture group of each candidate.
 * @returns {?string} The element's text, or null when nothing qualifies.
 */
function queryElement(regStr, html, option){
    var scanner = new RegExp(regStr, "img");
    var opener = scanner.exec(html);
    if(option != null && opener !== null){
        // Skip candidates until one passes the extra filter.
        var filter = new RegExp(option.regStr, "im");
        while(opener !== null && !filter.test(opener[option.index])){
            opener = scanner.exec(html);
        }
    }
    if(opener === null){
        return null;
    }
    var openTag = opener[0];
    var bodyStart = opener.index + openTag.length;
    // A tag ending in "/>" is complete by itself; otherwise ask queryCloseTag
    // how far the element extends past its opening tag.
    var bodyLength = /\/ *>$/.test(openTag)
        ? 0
        : queryCloseTag(opener[1], html.substring(bodyStart));
    return html.substring(opener.index, bodyStart + bodyLength);
}
改进queryElement
很多时候我们需要查找的结果是个数组,而不只是返回一个元素
/**
 * Find the element(s) whose opening tag matches regStr.
 * @param {string} regStr - Pattern for the opening tag; capture group 1 must
 *   be the tag name, which is passed on to queryCloseTag.
 * @param {string} html - HTML text to search.
 * @param {?Object} [option] - Optional settings; may be omitted.
 * @param {boolean} [option.multiElement] - Truthy: collect every match into
 *   an array; otherwise return only the first match.
 * @param {string} [option.regStr] - Extra pattern a candidate must satisfy.
 * @param {number} [option.index] - Capture group option.regStr is tested on.
 * @returns {?(string|string[])} With multiElement an array (possibly empty);
 *   otherwise the first element's text, or null when nothing qualifies.
 */
function queryElement(regStr, html, option){
    // Callers written for the two-argument form pass no option at all;
    // reading option.multiElement then threw a TypeError.
    var opts = option || {};
    var match = new RegExp(regStr, "img");
    // Loop-invariant, so build the filter once instead of per candidate.
    var filter = opts.regStr != null ? new RegExp(opts.regStr, "im") : null;
    var result = opts.multiElement ? [] : null;
    var group = match.exec(html);
    while(group != null){
        if(filter !== null){
            // Skip candidates that fail the extra filter.
            while(group != null && !filter.test(group[opts.index])){
                group = match.exec(html);
            }
            if(group == null){
                return result;
            }
        }
        var searchStart = group.index+group[0].length;
        var closeIndex = 0;
        // A tag ending in "/>" is complete by itself.
        if(!/\/ *>$/.test(group[0])){
            closeIndex = queryCloseTag(group[1], html.substring(searchStart));
        }
        var targetHtml = html.substring(group.index, searchStart+closeIndex);
        if(result == null){
            // Single-element mode: the first hit is the answer.
            return targetHtml;
        }
        result.push(targetHtml);
        // Continues right after this opening tag, so nested matches are
        // reported as separate entries as well.
        group = match.exec(html);
    }
    return result;
}
最终代码
/**
 * Look up a single element by its id attribute.
 * Delegates to getElementByAttr(html, "id", id, false) and returns its
 * result unchanged.
 * @param {string} html - HTML text to search.
 * @param {string} id - id value to look for.
 */
function getElementById(html, id){
    var attrName = "id";
    var wantAll = false;
    return getElementByAttr(html, attrName, id, wantAll);
}
/**
 * Look up all elements with the given tag name.
 * Builds the opening-tag pattern (capture group 1 = tag name) and hands it to
 * queryElement with multiElement enabled; the result is returned unchanged.
 * @param {string} html - HTML text to search.
 * @param {string} tag - Tag name; inserted into the pattern as-is.
 */
function getElementsByTag(html, tag){
    // (?=[\s/>]) requires the tag name to end right after `tag`, so that
    // "a" no longer matches "<article>" and "p" no longer matches "<pre>".
    var regStr = "< *("+tag+")(?=[\\s/>])[^>]*>";
    return queryElement(regStr, html, {multiElement: true});
}
/**
 * Look up all elements that carry every class listed in `classNames`.
 * Builds one lookahead per class and passes them to queryElement as a filter
 * on capture group 2 (the class attribute value) of the opening-tag pattern,
 * with multiElement enabled.
 * @param {string} html - HTML text to search.
 * @param {string} classNames - Dot-separated class list, e.g. ".page.in".
 * @returns {*} Whatever queryElement returns for this lookup.
 */
function getElementsByClass(html, classNames){
    var classArr = classNames.split(".");
    var classReg = "";
    for(var i=0; i<classArr.length; i++){
        var className = classArr[i];
        if(className.length>0){
            // Match the name literally ...
            var escaped = className.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
            // ... and only as a whole whitespace-delimited token: \b treated
            // "-" as a boundary, so "page" used to match "page-header".
            classReg+="(?=(?:^|[\\s\\S]*?\\s)"+escaped+"(?:\\s|$))";
        }
    }
    var option = {
        multiElement: true,
        index: 2,
        regStr: classReg
    };
    var regStr = "< *([^ |>]*)[^>]*?\\bclass *= *\"([^\"]*)\"[^>]*>";
    return queryElement(regStr, html, option);
}
/**
 * Look up element(s) that have the attribute key="value" (double-quoted).
 * Builds the opening-tag pattern (capture group 1 = tag name) and hands it to
 * queryElement, whose result is returned unchanged.
 * @param {string} html - HTML text to search.
 * @param {string} key - Attribute name, matched literally.
 * @param {string} value - Attribute value, matched literally.
 * @param {boolean} [multiElement] - Forwarded to queryElement as
 *   option.multiElement.
 */
function getElementByAttr(html, key, value, multiElement){
    // Escape regex metacharacters so e.g. an id of "a.b" only matches "a.b".
    var escapeReg = function(text){
        return String(text).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    };
    // - [^>]*? (instead of .*?) keeps the match inside ONE tag; ".*?" could
    //   start at an earlier element on the same line and run across its ">".
    // - [^\s/>]+ keeps tabs/newlines and "/" out of the captured tag name.
    // - The attribute name must be preceded by whitespace, so key "id" does
    //   not match inside "data-id".
    var regStr = "<\\s*([^\\s/>]+)(?:\\s[^>]*?)?\\s"+escapeReg(key)
        +"\\s*=\\s*\""+escapeReg(value)+"\"[^>]*>";
    return queryElement(regStr, html, {multiElement: multiElement});
}
/**
 * Find the element(s) whose opening tag matches regStr.
 * @param {string} regStr - Pattern for the opening tag; capture group 1 must
 *   be the tag name, which is passed on to queryCloseTag.
 * @param {string} html - HTML text to search.
 * @param {?Object} [option] - Optional settings; may be omitted.
 * @param {boolean} [option.multiElement] - Truthy: collect every match into
 *   an array; otherwise return only the first match.
 * @param {string} [option.regStr] - Extra pattern a candidate must satisfy.
 * @param {number} [option.index] - Capture group option.regStr is tested on.
 * @returns {?(string|string[])} With multiElement an array (possibly empty);
 *   otherwise the first element's text, or null when nothing qualifies.
 */
function queryElement(regStr, html, option){
    // Tolerate a missing option object instead of throwing a TypeError on
    // option.multiElement.
    var opts = option || {};
    var match = new RegExp(regStr, "img");
    // Loop-invariant, so build the filter once instead of per candidate.
    var filter = opts.regStr != null ? new RegExp(opts.regStr, "im") : null;
    var result = opts.multiElement ? [] : null;
    var group = match.exec(html);
    while(group != null){
        if(filter !== null){
            // Skip candidates that fail the extra filter.
            while(group != null && !filter.test(group[opts.index])){
                group = match.exec(html);
            }
            if(group == null){
                return result;
            }
        }
        var searchStart = group.index+group[0].length;
        var closeIndex = 0;
        // A tag ending in "/>" is complete by itself.
        if(!/\/ *>$/.test(group[0])){
            closeIndex = queryCloseTag(group[1], html.substring(searchStart));
        }
        var targetHtml = html.substring(group.index, searchStart+closeIndex);
        if(result == null){
            // Single-element mode: the first hit is the answer.
            return targetHtml;
        }
        result.push(targetHtml);
        // Continues right after this opening tag, so nested matches are
        // reported as separate entries as well.
        group = match.exec(html);
    }
    return result;
}
/**
 * Find where the element that was opened just before `html` ends.
 * Void elements (no closing tag) end right at their opening tag; for all
 * other tags, opening (+1) and closing (-1) tags named `tag` are counted,
 * starting at 1 for the already-consumed opening tag.
 * @param {string} tag - Tag name; inserted into the patterns as-is.
 * @param {string} html - Text that follows the element's opening tag.
 * @returns {number} Offset in `html` just past the last closing tag seen
 *   before the count reached zero or the tags ran out; 0 for void elements
 *   or when no closing tag was seen.
 */
function queryCloseTag(tag, html){
    // The previous list read "input br image", so "img" was never recognised.
    // The check is also loop-invariant, so it is done once up front.
    var voidTags = ["area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "source", "track", "wbr"];
    if(voidTags.indexOf(String(tag).toLowerCase()) !== -1){
        return 0;
    }
    // (?=[\s/>]) requires the tag name to end right after `tag`; without it
    // "p" also counted "<pre>" and "</pre>" as openings of "<p>".
    var regStrAll = "< */? *"+tag+"(?=[\\s/>])[^>]*>";
    var matchAll = new RegExp(regStrAll, "img");
    var regStrClose = "< */ *"+tag+" *>";
    // No "g" flag: test() must stay stateless across iterations.
    var matchClose = new RegExp(regStrClose, "im");
    var openCount = 1;
    var lastCloseIndex = 0;
    while(openCount > 0){
        var groupAll = matchAll.exec(html);
        if(groupAll == null){
            break;
        }
        if(matchClose.test(groupAll[0])){
            openCount--;
            lastCloseIndex = groupAll.index+groupAll[0].length;
        }else{
            openCount++;
        }
    }
    return lastCloseIndex;
}
下一篇 实现#(id)空格(层级).(类)混合查询