自己写的一个正文提取算法,在三个网站上测试没问题
需要使用第三方 jar 包:jsoup,以及 Apache commons-io(代码中用到了 FileUtils)
package com.extract;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Heuristic main-text extractor for HTML pages.
 *
 * <p>The page is first stripped of elements that rarely carry article text
 * (scripts, styles, form controls, lists, links, hidden blocks). The DOM is
 * then walked downwards, always following the child that still holds more than
 * {@link #MIN_TEXT_RATIO} of the page's total text, until no child qualifies.
 * The deepest element reached is reported as the content container.
 */
public class ExtractNovel {

    /** Sample page that is read when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "C://Users//Administrator//Desktop//sina.com";

    /** A child must hold more than this share of the reference text length to be followed. */
    private static final float MIN_TEXT_RATIO = 0.75f;

    /** Tags whose elements are dropped wholesale before the text is measured. */
    private static final String[] NOISE_TAGS = {
        "script", "style", "select", "link", "input", "object", "textarea", "ul", "img", "a"
    };

    /**
     * Reads an HTML file, removes noise elements and prints the detected content element.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to read. When absent,
     *             {@link #DEFAULT_INPUT_PATH} is used.
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        // NOTE: this overload decodes the file with the platform default charset.
        String html = FileUtils.readFileToString(new File(path));

        Document doc = denoiseElementForDoc(Jsoup.parse(html));
        int size = doc.text().length();

        // getAllElements() lists the element itself first, so this is the document root.
        Element root = doc.getAllElements().get(0);
        check(root, size);
    }

    /**
     * Prints the outer HTML and then the plain text of the content element found under {@code e}.
     *
     * <p>When no child of {@code e} exceeds the text-share threshold, {@code e} itself is
     * printed instead of failing with a {@link NullPointerException}.
     *
     * @param e    element to search below
     * @param size reference text length against which each child's text length is compared
     */
    public static void check(Element e, float size) {
        Element son = findRealSon(e, size);
        if (son == null) {
            // No child dominates the text, so e is the tightest container we can report.
            son = e;
        }
        System.out.println(son.toString());
        System.out.println(son.text());
    }

    /**
     * Finds the deepest descendant of {@code e} that still holds more than
     * {@link #MIN_TEXT_RATIO} of {@code size} characters of text.
     *
     * @param e    element whose children are examined
     * @param size reference text length (the caller in this class passes the length of the
     *             whole document's text)
     * @return the deepest qualifying descendant, or {@code null} when no direct child of
     *         {@code e} qualifies
     */
    public static Element findRealSon(Element e, float size) {
        Elements children = e.children();
        Element found = null;
        for (Element child : children) {
            float length = child.text().length();
            if (length / size > MIN_TEXT_RATIO) {
                Element deeper = findRealSon(child, size);
                if (deeper == null) {
                    // None of this child's own children qualify: the child is the answer.
                    return child;
                }
                found = deeper;
            }
        }
        return found;
    }

    /**
     * Removes elements that are unlikely to contain article text. The document is modified
     * in place.
     *
     * @param document parsed page to clean
     * @return the same {@code document} instance, after removal
     */
    public static Document denoiseElementForDoc(Document document) {
        for (String tag : NOISE_TAGS) {
            document.getElementsByTag(tag).remove();
        }

        // NOTE(review): this matches an HTML attribute literally named "display"; inline-style
        // hiding is handled by the "style" checks below.
        document.getElementsByAttributeValue("display", "none").remove();
        document.getElementsByAttributeValueStarting("class", "foot").remove();
        document.getElementsByAttributeValue("class", "settings").remove();
        document.getElementsByAttributeValueContaining("style", "display:none").remove();
        document.getElementsByAttributeValueContaining("style", "overflow: hidden").remove();
        return document;
    }
}