There are many ways to extract data from web pages, but most of them boil down to the same principle: simulate an HTTP request, send it to the server, receive the response, and parse the result. The process can be as simple or as complicated as the site demands. Here I want to organize some of the things I have done, the paths I have taken, and the pitfalls I have run into.
1. Basic approach
Here is a Java download example that illustrates the basic idea.
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public void downPDF(String urlString, String filename, String pdf,
        String chk, String chk1, String chk2, String chk3) throws Exception {
    // pdf and chk1..chk3 are unused in this excerpt; chk feeds the Cookie header
    URL server = new URL(urlString);
    HttpURLConnection connection = (HttpURLConnection) server.openConnection();
    connection.setRequestMethod("GET");
    connection.setDoInput(true);
    connection.setUseCaches(false);
    // Note: do NOT call setDoOutput(true) on a GET; HttpURLConnection
    // silently turns such a request into a POST
    connection.addRequestProperty("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    connection.addRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
    // Advertising gzip/deflate means the response may arrive compressed;
    // see the decoding sketch after this example
    connection.addRequestProperty("Accept-Encoding", "gzip, deflate");
    connection.addRequestProperty("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
    connection.addRequestProperty("Cookie", "chk=" + chk);
    connection.addRequestProperty("User-Agent",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 2.0.50727; MS-RTC LM 8)");
    connection.connect();
    InputStream is = connection.getInputStream();
    OutputStream os = new FileOutputStream(filename);
    try {
        byte[] buffer = new byte[1024 * 128];
        int byteReaded;
        while ((byteReaded = is.read(buffer)) != -1) {
            os.write(buffer, 0, byteReaded);
        }
    } finally {
        os.close();
        is.close();
    }
}
As you can see, the process is exactly the same for HTML: obtaining the server's response this way is easy. Where the problems pile up is the block of addRequestProperty calls in the middle. Knowing exactly which headers to send requires understanding the whole HTTP exchange, including things like cookies and sessions.
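One pitfall in the example above: it advertises Accept-Encoding: gzip, deflate but writes the raw stream to disk, so a server that actually compresses the response leaves you with a gzipped file. A minimal sketch of decoding according to Content-Encoding (the helper name is mine, not from the original code):

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

// Wrap the response stream according to Content-Encoding before saving it.
static InputStream decode(HttpURLConnection connection) throws IOException {
    InputStream is = connection.getInputStream();
    String enc = connection.getContentEncoding();
    if ("gzip".equalsIgnoreCase(enc)) {
        return new GZIPInputStream(is);
    } else if ("deflate".equalsIgnoreCase(enc)) {
        return new InflaterInputStream(is);
    }
    return is; // uncompressed response
}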
2. Using HttpClient
The basic approach above falls short of covering every situation, which is where HttpClient, an Apache open-source project, comes in.
The main features HttpClient provides are listed below; see the HttpClient homepage for the full details.
(1) Implements all HTTP methods (GET, POST, PUT, HEAD, etc.)
(2) Supports automatic redirects
(3) Supports the HTTPS protocol
(4) Supports proxy servers, and more
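Before the preemptive-authentication example below, here is a minimal sketch of a plain GET plus a form POST with HttpClient 4.x. The example.com URLs and the form parameter names are placeholders, not anything from the original post:

import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class SimpleFetch {
    public static void main(String[] args) throws Exception {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        // Plain GET
        HttpGet get = new HttpGet("http://example.com/");
        HttpResponse response = httpclient.execute(get);
        String body = EntityUtils.toString(response.getEntity(), "UTF-8");
        System.out.println(response.getStatusLine() + " === " + body.length() + " chars");
        // Form POST; the parameter names are invented for illustration
        HttpPost post = new HttpPost("http://example.com/login");
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("user", "admin"));
        params.add(new BasicNameValuePair("pass", "secret"));
        post.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));
        response = httpclient.execute(post);
        EntityUtils.consume(response.getEntity()); // release the connection
        httpclient.getConnectionManager().shutdown();
    }
}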
HttpHost targetHost = new HttpHost("192.168.1.1", 80, "http");
DefaultHttpClient httpclient = new DefaultHttpClient();
// Register credentials for the target host
httpclient.getCredentialsProvider().setCredentials(
        new AuthScope(targetHost.getHostName(), targetHost.getPort()),
        new UsernamePasswordCredentials("admin", "99714029lyl"));
// Create an AuthCache instance and preload it with a BASIC scheme, so the
// credentials go out with the first request instead of after a 401 challenge
AuthCache authCache = new BasicAuthCache();
BasicScheme basicAuth = new BasicScheme();
authCache.put(targetHost, basicAuth);
// Add the AuthCache to the execution context
BasicHttpContext localcontext = new BasicHttpContext();
localcontext.setAttribute(ClientContext.AUTH_CACHE, authCache);
String url = URL; // URL: constant holding the request path, defined elsewhere
HttpPost httpPost = new HttpPost(url);
httpPost.setHeader("Content-Type", "application/x-www-form-urlencoded");
// Note: no request entity is attached here; a real form POST would set a
// UrlEncodedFormEntity as in the sketch above
for (int i = 0; i < 3; i++) {
    try {
        HttpResponse response = httpclient.execute(targetHost, httpPost,
                localcontext);
        HttpEntity entity = response.getEntity();
        System.out.println(response.getStatusLine() + "==="
                + EntityUtils.toString(entity));
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
The above is a simple example. One thing worth pointing out: this library is easily confused with commons-httpclient. commons-httpclient is the legacy version, which the project officially no longer recommends; as far as I can tell it also lacks the classes for loading an SSLContext, which accessing some HTTPS sites requires. HttpClient is the version still under active development and maintenance, and it is considerably more capable.
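On the SSLContext point: with the HttpClient 4.1-era API (the same DefaultHttpClient generation used above), an HTTPS site with a self-signed certificate can be handled by registering a custom scheme. A sketch, assuming you accept the trade-off; trusting every certificate is for testing only:

import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.DefaultHttpClient;

public class TrustAll {
    // Returns a client that accepts any certificate on any HTTPS host.
    public static DefaultHttpClient trustAllClient() throws Exception {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        SSLSocketFactory sslsf = new SSLSocketFactory(new TrustStrategy() {
            public boolean isTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                return true; // accept everything -- testing only
            }
        }, SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
        httpclient.getConnectionManager().getSchemeRegistry()
                .register(new Scheme("https", 443, sslsf));
        return httpclient;
    }
}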
3. Using htmlparser
This library is also fairly powerful; the focus here is on how to parse the HTTP response and get at the individual elements of the HTML document.
// conManager is a shared field: private static ConnectionManager conManager;
public static Parser getParser(String url, String encode) {
    Parser parser = null;
    try {
        parser = new Parser(url);
        if (conManager == null) {
            conManager = new ConnectionManager();
            // Send a custom User-Agent with every request
            Hashtable<String, String> properties = new Hashtable<String, String>();
            properties.put("User-Agent", "IE/6.0");
            ConnectionManager.setDefaultRequestProperties(properties);
            parser.setConnectionManager(conManager);
        }
        parser.setEncoding(encode);
    } catch (Exception e) {
        System.out.println("You must be offline! " + e);
        return null;
    }
    return parser;
}
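A quick usage sketch (the URL is a placeholder) tying getParser to the helpers defined below; note that a Parser can only be iterated once per fetch, so call parser.reset() before running a second pass over the same page:

Parser parser = getParser("http://example.com/news.html", "UTF-8"); // placeholder URL
if (parser != null) {
    System.out.println(getMainText(parser)); // getMainText is defined below
}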
public static List<ImageTag> getImages(Parser parser, NodeList collectionList) {
    List<ImageTag> list = new ArrayList<ImageTag>();
    TagNameFilter filterImg = new TagNameFilter("IMG");
    try {
        // Walk the whole document and collect every <img> tag
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
            e.nextNode().collectInto(collectionList, filterImg);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    for (SimpleNodeIterator e = collectionList.elements(); e.hasMoreNodes();) {
        Node node = e.nextNode();
        if (node instanceof ImageTag) {
            list.add((ImageTag) node);
        }
    }
    return list;
}
public static String getMainText(Parser parser) {
    StringBuilder content = new StringBuilder();
    try {
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
            content.append(collectText(e.nextNode()));
        }
        // Strip the non-breaking spaces left behind by &nbsp; entities
        return content.toString().replaceAll("&nbsp;|\u00A0", "");
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return content.toString();
}

// Recursively gather plain text, skipping <script> and <style> subtrees
private static String collectText(Node node) {
    if (node instanceof ScriptTag || node instanceof StyleTag) {
        return "";
    }
    if (node.getChildren() == null) {
        return node.toPlainTextString();
    }
    StringBuilder sb = new StringBuilder();
    for (SimpleNodeIterator e = node.getChildren().elements(); e.hasMoreNodes();) {
        sb.append(collectText(e.nextNode()));
    }
    return sb.toString();
}
These are just a few usages; the many other ways of traversing the document and locating nodes follow similar patterns. Extracting what you want from the nodes is then easy, although a fair amount of manual labor is unavoidable.
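For instance, collecting every link on a page uses the same extractAllNodesThatMatch pattern as the table code below; LinkTag.extractLink() already resolves the target URL:

import java.util.ArrayList;
import java.util.List;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

// Collect the target of every <a> tag; the pattern works for any tag class.
public static List<String> getLinks(Parser parser) {
    List<String> links = new ArrayList<String>();
    try {
        NodeList list = parser.extractAllNodesThatMatch(
                new NodeClassFilter(LinkTag.class));
        for (int i = 0; i < list.size(); i++) {
            LinkTag link = (LinkTag) list.elementAt(i);
            links.add(link.extractLink()); // resolved link URL
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}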
I also want to record a method I used when processing tables; done carefully, this approach can extract tabular data fairly intelligently.
public List<String> processTable(Parser parser) {
    List<String> listTable = new ArrayList<String>();
    NodeList list;
    try {
        list = parser.extractAllNodesThatMatch(new NodeClassFilter(
                TableTag.class));
        for (int i = 0; i < list.size(); i++) {
            TableTag table = (TableTag) list.elementAt(i);
            TableRow tr[] = table.getRows();
            for (int j = 0; j < tr.length; j++) {
                // Heuristic: only tables with at least 5 rows and 5 columns
                // are likely to carry real data rather than layout
                if (table.getRowCount() >= 5 && tr[j].getColumnCount() >= 5) {
                    StringBuilder sb = new StringBuilder();
                    TableColumn tc[] = tr[j].getColumns();
                    for (int k = 0; k < tc.length; k++) {
                        sb.append(tc[k].toPlainTextString()).append(Util.TOKEN);
                    }
                    listTable.add(sb.toString());
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return listTable;
}
public List<String> processTable(Parser parser, int row, int col) {
    List<String> listTable = new ArrayList<String>();
    NodeList list = null;
    try {
        list = parser.extractAllNodesThatMatch(new NodeClassFilter(
                TableTag.class));
        for (int i = 0; i < list.size(); i++) {
            TableTag table = (TableTag) list.elementAt(i);
            TableRow tr[] = table.getRows();
            for (int j = 0; j < tr.length; j++) {
                // Caller supplies the minimum row count and exact column count
                if (table.getRowCount() >= row
                        && tr[j].getColumnCount() == col) {
                    StringBuilder sb = new StringBuilder();
                    TableColumn tc[] = tr[j].getColumns();
                    for (int k = 0; k < tc.length; k++) {
                        sb.append(tc[k].toPlainTextString()).append(Util.TOKEN);
                    }
                    listTable.add(sb.toString());
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return listTable;
}
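A hypothetical driver for the second overload, called from within the same class; the URL and the 5x8 shape are placeholders. Util.TOKEN is the cell separator appended above, so it is quoted in case it contains regex metacharacters:

Parser parser = getParser("http://example.com/table.html", "UTF-8"); // placeholder URL
if (parser != null) {
    for (String row : processTable(parser, 5, 8)) {
        String[] cells = row.split(java.util.regex.Pattern.quote(Util.TOKEN));
        System.out.println(java.util.Arrays.toString(cells));
    }
}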
4. XPath
Speaking of XPath: it is indispensable for data extraction, and it tends to inspire a love-hate relationship, especially with HTML, because XPath was designed for XML rather than HTML, which causes plenty of inconvenience.
Without going into every detail: given an XPath expression, you traverse the tree, locate the nodes, and parse out the data. A code fragment (using HtmlUnit) is listed below.
public static List<String> getDetailInfo(String city, String url, String img) {
    List<String> list = new ArrayList<String>();
    HtmlPage page = null;
    try {
        page = webClient.getPage(url);
        // Absolute XPath expressions recorded from the browser: brittle,
        // but quick to produce
        String courseName = getData(page,
                "/html/body/div[6]/div[1]/div[1]/div[1]/h1", null);
        List<?> hbList = page
                .getByXPath("/html/body/div[6]/div[1]/div[1]/div[2]/div[2]");
        HtmlDivision hb = (HtmlDivision) hbList.get(0);
        String lines[] = hb.asText().split("\r\n");
        for (String str : lines) {
            list.add(str);
        }
    } catch (Exception ee) {
        ee.printStackTrace();
    }
    ..........
}
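The fragment assumes a shared webClient field and a getData helper, neither of which appears in the original, so here is one plausible way to set them up. The option calls match HtmlUnit 2.11+; older releases expose the same switches as setters directly on WebClient, and much newer ones rename asText() to asNormalizedText():

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class PageFetcher {
    // Shared client; JS and CSS are switched off since we only extract markup
    static WebClient webClient = new WebClient();
    static {
        webClient.getOptions().setJavaScriptEnabled(false);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
    }

    // A plausible shape for the getData(...) helper used above: the plain
    // text of the first node the XPath matches, or a fallback value
    static String getData(HtmlPage page, String xpath, String fallback) {
        java.util.List<?> nodes = page.getByXPath(xpath);
        if (nodes.isEmpty()) {
            return fallback;
        }
        return ((DomNode) nodes.get(0)).asText();
    }
}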