Building an on-site search with webcollector and lucene
webcollector
web crawler
https://github.com/CrawlScript/WebCollector
lucene
document indexing and retrieval
https://lucene.apache.org/
Crawling page content
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;

public class SiteCrawler extends RamCrawler {

    protected static final Logger LOG = Logger.getLogger(SiteCrawler.class);

    // Lucene documents collected during the crawl; written to the index in afterStop()
    List<Document> docList;

    public SiteCrawler() {
        super(true); // auto-parse pages so outgoing links are extracted automatically
        String domain = "http://www.fengshangbin.com/";
        docList = new ArrayList<Document>();
        this.addSeed(domain);
        // Follow only URLs under the site domain
        this.addRegex(domain + ".*?");
        // Exclude static resources and in-page anchors ("-" marks a negative rule)
        this.addRegex("-.*\\.(jpg|png|gif|svg|css|js|pdf).*");
        this.addRegex("-.*#.*");
        setThreads(30);
        getConf().setTopN(100);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        String url = page.url();
        String title = page.select("title").first().text();
        String content = page.select("body").text();
        // One Lucene document per page: title/content are tokenized, link is stored verbatim
        Document luceneDocument = new Document();
        TextField titleField = new TextField("title", title, Store.YES);
        TextField contentField = new TextField("content", content, Store.YES);
        StringField linkField = new StringField("link", url, Store.YES);
        luceneDocument.add(titleField);
        luceneDocument.add(contentField);
        luceneDocument.add(linkField);
        docList.add(luceneDocument);
    }
}
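For a quick local test, the crawler can also be run directly from a main method before wiring it into the midnight scheduler described at the end of this article. This main method is a sketch and is not part of the original class:

public static void main(String[] args) throws Exception {
    SiteCrawler crawler = new SiteCrawler();
    // Crawl up to 5 levels deep from the seed; afterStop() then rebuilds the index
    crawler.start(5);
}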
Indexing the crawl results
Add a method to SiteCrawler.java that runs after the crawl finishes:
@Override
public void afterStop() {
    try {
        // Drop the in-memory crawl records, then rebuild the Lucene index from scratch
        this.dbManager.clear();
        LuceneUtils.indexDelAll(SiteConst.LUCENE_PATH);
        LuceneUtils.indexUpdate(docList, SiteConst.LUCENE_PATH);
        docList.clear();
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
    }
    LOG.info("site index end");
}
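SiteConst.LUCENE_PATH is referenced above but not defined in this article. A minimal sketch, assuming it is just a constants holder (the path value is a placeholder, not the original configuration):

public class SiteConst {
    // Directory where the Lucene index files are stored; adjust to your deployment
    public static final String LUCENE_PATH = "/data/site-search/index";
}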
Updating the Lucene index
public static void indexUpdate(List<Document> docList, String indexSaveDir) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    Path path = Paths.get(indexSaveDir);
    Directory directory = FSDirectory.open(path);
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter indexWriter = new IndexWriter(directory, config);
    for (int i = 0; i < docList.size(); i++) {
        Document luceneDocument = docList.get(i);
        // Use the page link as the unique key: updateDocument replaces any existing
        // document with the same link, so re-crawled pages are not duplicated
        Term term = new Term("link", luceneDocument.get("link"));
        indexWriter.updateDocument(term, luceneDocument);
        // Flush to disk every 50 documents to keep memory usage bounded
        if ((i + 1) % 50 == 0) {
            indexWriter.flush();
        }
    }
    indexWriter.flush();
    indexWriter.commit();
    indexWriter.close();
}
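LuceneUtils.indexDelAll, called in afterStop above, is also not shown in the original. A minimal sketch, assuming it simply wipes the whole index with IndexWriter.deleteAll() before the fresh crawl results are written:

public static void indexDelAll(String indexSaveDir) throws IOException {
    Directory directory = FSDirectory.open(Paths.get(indexSaveDir));
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    IndexWriter indexWriter = new IndexWriter(directory, config);
    // Remove every document; indexUpdate re-adds the latest crawl results afterwards
    indexWriter.deleteAll();
    indexWriter.commit();
    indexWriter.close();
}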
Searching the index
public static Map<String, Object> indexSearch(String indexDir, String queryWord, int start, int end) throws Exception {
    Map<String, Object> result = new HashMap<String, Object>();
    ArrayList<Map<String, String>> list = new ArrayList<Map<String, String>>();
    result.put("list", list);
    Analyzer analyzer = new StandardAnalyzer();
    // Search both the title and the content fields with a single query
    String[] fields = {"title", "content"};
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
    Query query = queryParser.parse(queryWord);
    Path path = Paths.get(indexDir);
    Directory dir = FSDirectory.open(path);
    DirectoryReader directoryReader = DirectoryReader.open(dir);
    IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
    // Fetch the top `end` hits; the loop below skips the first `start` of them for paging
    TopDocs topdocs = indexSearcher.search(query, end);
    ScoreDoc[] scoreDocs = topdocs.scoreDocs;
    ScoreDoc loopScoreDoc = null;
    // Wrap matched terms in a span so they can be highlighted in the result page
    Formatter formatter = new SimpleHTMLFormatter("<span class='hight'>", "</span>");
    QueryScorer scorer = new QueryScorer(query);
    Highlighter highlighter = new Highlighter(formatter, scorer);
    // The fragmenter controls the size of the highlighted snippet
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
    highlighter.setTextFragmenter(fragmenter);
    result.put("total", topdocs.totalHits.value);
    for (int i = start; i < scoreDocs.length; i++) {
        loopScoreDoc = scoreDocs[i];
        int docID = loopScoreDoc.doc;
        Document document = directoryReader.document(docID);
        String content = document.get("content");
        String highlighterContent = highlighter.getBestFragment(analyzer, "content", content);
        // Fall back to the first 80 characters when the query terms yield no fragment
        if (highlighterContent == null) {
            highlighterContent = content.substring(0, Math.min(80, content.length()));
        }
        Map<String, String> item = new HashMap<String, String>();
        item.put("link", document.get("link"));
        item.put("title", document.get("title"));
        item.put("content", highlighterContent + " ...");
        list.add(item);
    }
    directoryReader.close();
    dir.close();
    result.put("code", 200);
    return result;
}
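A usage sketch for indexSearch, translating a 1-based page number into the start/end offsets it expects (the page size of 10 and the query string are assumptions for illustration):

int pageNo = 1;
int pageSize = 10;
int start = (pageNo - 1) * pageSize;
int end = pageNo * pageSize;
Map<String, Object> result = LuceneUtils.indexSearch(SiteConst.LUCENE_PATH, "lucene", start, end);
System.out.println("total hits: " + result.get("total"));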
Scheduling the crawler to run automatically every day at midnight
@Override
public void contextInitialized(ServletContextEvent sce) {
    LOG.info("site search start:");
    TimerTask task = new TimerTask() {
        @Override
        public void run() {
            // Re-crawl the whole site and rebuild the index
            SiteCrawler crawler = new SiteCrawler();
            try {
                crawler.start(5);
            } catch (Exception e) {
                LOG.error(e.getMessage(), e);
            }
        }
    };
    Timer timer = new Timer();
    long now = new Date().getTime();
    SimpleDateFormat sdfOne = new SimpleDateFormat("yyyy-MM-dd");
    long overTime = 0;
    try {
        // Milliseconds elapsed since midnight today
        overTime = now - sdfOne.parse(sdfOne.format(now)).getTime();
    } catch (ParseException e) {
        LOG.error(e.getMessage(), e);
    }
    long intervalPeriod = 24 * 60 * 60 * 1000;
    // First run at the next midnight, then repeat every 24 hours
    long delay = intervalPeriod - overTime;
    LOG.info("site search next index:" + delay);
    timer.scheduleAtFixedRate(task, delay, intervalPeriod);
}
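To avoid leaking the timer thread on undeploy, the listener can also cancel it in contextDestroyed. This sketch assumes the Timer above is promoted from a local variable to a field of the listener, which is a change to the original code:

@Override
public void contextDestroyed(ServletContextEvent sce) {
    // `timer` is assumed to be an instance field here rather than a local variable
    if (timer != null) {
        timer.cancel();
        LOG.info("site search timer cancelled");
    }
}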