[HTMLParser]配合[正则表达式]使用 过滤爬取网页
爬虫已经能够把所有页面爬下来了,但保存下来的是带有完整HTML标签的html文件,所以使用HtmlParser来过滤掉不需要的HTML标签。之后又想到可以直接用正则表达式去除98上的[url]这类BB代码标签。别看只是一个正则表达式,其实折腾了好久。在Java字符串里,用正则表达式匹配左中括号[要写成\\[,同理匹配问号?要写成\\?(字符串字面量中的\\经过转义后才是正则表达式里的一个\)。别被网上一些说法误导,否则匹配结果全是false,还是自己查API比较靠谱。
HTMLParser的核心模块是org.htmlparser.Parser类,这个类实际完成了对于HTML页面的分析工作。这个类有下面几个构造函数:
public Parser ();
public Parser (Lexer lexer, ParserFeedback fb);
public Parser (URLConnection connection, ParserFeedback fb) throws ParserException;
public Parser (String resource, ParserFeedback feedback) throws ParserException;
public Parser (String resource) throws ParserException;
public Parser (Lexer lexer);
public Parser (URLConnection connection) throws ParserException;
此外还有一个静态工厂方法 public static Parser createParser (String html, String charset);
HtmlParser的filter功能也很强大,支持各种组合。下面列出16个filter,后面的代码(包括被注释掉的部分)用到了其中几种:
判断类Filter:
TagNameFilter
HasAttributeFilter
HasChildFilter
HasParentFilter
HasSiblingFilter
IsEqualFilter
逻辑运算Filter:
AndFilter
NotFilter
OrFilter
XorFilter
其他Filter:
NodeClassFilter
StringFilter
LinkStringFilter
LinkRegexFilter
RegexFilter
CssSelectorNodeFilter
以下就来贴下完整的处理代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | package com.htmlparser; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.FileInputStream; import java.io.File; import java.io.OutputStreamWriter; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.NotFilter; import org.htmlparser.filters.StringFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.Parser; public class TestHtmlParser { private static String ENCODE = "UTF-8"; // 用于输出显示测试 private void message(String szMsg) { try { System.out.println(new String(szMsg.getBytes(ENCODE), System .getProperty("file.encoding"))); } catch (Exception e) { } } // 打开html文件,读取文件所有内容 public String openFile(String filename) { try { BufferedReader bis = new BufferedReader(new InputStreamReader( new FileInputStream(new File(filename)), ENCODE)); String content = ""; String temp; while ((temp = bis.readLine()) != null) { content += temp + "\n"; } bis.close(); return content; } catch (Exception e) { return ""; } } // 将解析好的字符串,写到.txt文件 public boolean writeFile(String 
filename, String content) { boolean result = false; try { OutputStreamWriter osw = new OutputStreamWriter( new FileOutputStream(new File("E:\\MyCrawl\\cc98-txt\\" + filename + ".txt")), ENCODE); osw.write(content); osw.close(); result = true; } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return result; } // 解析一个文件 public void parseFile(File file) { System.out.println("parsing:" + file.getName()); String content = openFile(file.getPath()); try { Parser parser = Parser.createParser(content, ENCODE); // TextExtractingVisitor visitor = new TextExtractingVisitor(); // parser.visitAllNodesWith(visitor); // String textInPage = visitor.getExtractedText(); // // message(textInPage); // NodeFilter Tagfilter = new TagNameFilter("span"); // NodeFilter Urlfilter = new StringFilter("http://"); // NodeFilter noUrlfilter = new NotFilter(Urlfilter); // NodeFilter filter = new AndFilter(Tagfilter, noUrlfilter); NodeFilter tagfilter = new TagNameFilter("span"); NodeFilter filterID = new HasAttributeFilter("id"); NodeFilter filter = new AndFilter(tagfilter, filterID); NodeList nodes = parser.extractAllNodesThatMatch(filter); String txt = ""; if (nodes != null) { for (int i = 0; i < nodes.size(); i++) { Node textnode = (Node) nodes.elementAt(i); // message("getText:" + textnode.getText()); txt += textnode.toPlainTextString() + "\n\r"; } } txt = txt.replaceAll("<a href="file://[/?[a-zA-Z0-9\\?\\">\\[/?[a-zA-Z0-9\\?\\</a>. 
/&~=:,&&[^\\]]]+\\]", ""); txt = txt.replaceAll("&nbsp;", "").replaceAll("[ \t]+", "");// .replaceAll("[\n]+", "\n"); writeFile(file.getName(), txt); // for (NodeIterator i = parser.elements(); i.hasMoreNodes();) { // Node node = i.nextNode(); // message("getText:" + node.getText()); // message("getPlainText:" + node.toPlainTextString()); // message("toHtml:" + node.toHtml()); // message("toHtml(true):" + node.toHtml(true)); // message("toHtml(false):" + node.toHtml(false)); // message("toString:" + node.toString()); // message("================================================="); // if(node.getText().length()>=3){ // if("html".equals(node.getText().substring(0, 4))){ // message("getPlainText:" + node.toPlainTextString()); // } // } // } } catch (Exception e) { System.out.println("Exception:" + e); } } // 主函数,遍历cc98目录 public static void main(String[] args) { TestHtmlParser thp = new TestHtmlParser(); LinkedList<String> folderList = new LinkedList<String>(); folderList.add("E:\\MyCrawl\\cc98\\"); while (folderList.size() > 0) { File file = new File(folderList.poll()); File[] files = file.listFiles(); List<File> fileList = new ArrayList<File>(); for (int i = 0; i < files.length; i++) { if (files[i].isDirectory()) { folderList.add(files[i].getPath()); } else { fileList.add(files[i]); } } for (File f : fileList) { thp.parseFile(f); } } } } |