Java tutorial
/* * Copyright 2013 Eolya Consulting - http://www.eolya.fr/ * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package fr.eolya.extraction.tika; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.MalformedURLException; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import org.apache.commons.io.FileUtils; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.html.BoilerpipeContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.safety.Cleaner; import org.jsoup.safety.Whitelist; import org.jsoup.select.Elements; import de.jetwick.snacktory.ArticleTextExtractor; import de.jetwick.snacktory.JResult; import de.jetwick.snacktory.OutputFormatter; import de.l3s.boilerpipe.extractors.ArticleExtractor; import de.l3s.boilerpipe.extractors.CanolaExtractor; import de.l3s.boilerpipe.extractors.DefaultExtractor; import com.developpez.adiguba.shell.Shell; import fr.eolya.extraction.htmlformater.IHtmlFormater; /** * Wraps Apache Tika library in order to allow a simple usage and add or improve some features. * * @author Eolya Consulting - http://www.eolya.fr/ */ public class TikaWrapper { public static String OUTPUT_FORMAT_XML = "xml"; public static String OUTPUT_FORMAT_HTML = "html"; public static String OUTPUT_FORMAT_TEXT = "text"; public static String OUTPUT_FORMAT_TEXT_MAIN = "text_main"; public static String OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY = "text_main_snacktory"; public static String OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT = "text_main_boilerpipe_default"; public static String OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE = "text_main_boilerpipe_article"; public static String OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA = "text_main_boilerpipe_canola"; public static String CONTENT_TYPE_PDF = "application/pdf"; public static String CONTENT_TYPE_SWF = "application/x-shockwave-flash"; public static String CONTENT_TYPE_HTML = "text/html"; public static String CONTENT_TYPE_DJVU = "image/vnd.djvu "; private static String META_TITLE = "title"; private static String META_AUTHOR = "Author"; private static String META_CREATED = "Creation-Date"; private static String META_MODIFIED = "modified"; private static String META_CONTENTTYPE = "Content-Type"; private static String META_CONTENTSIZE = "Content-Size"; private class OutputType { public void process(InputStream input, OutputStream output, Metadata metadata) throws Exception { Parser p = parser; ContentHandler handler = getContentHandler(output, metadata); p.parse(input, handler, metadata, context); } protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { throw new UnsupportedOperationException(); } } private final OutputType XML = new OutputType() { @Override protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { return getTransformerHandler(output, "xml", encoding, prettyPrint); } }; private final OutputType HTML = new OutputType() { @Override protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { return new ExpandedTitleContentHandler(getTransformerHandler(output, "html", encoding, prettyPrint)); } }; private final OutputType TEXT = new OutputType() { @Override protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { return new BodyContentHandler(getOutputWriter(output, encoding)); } }; private final OutputType TEXT_MAIN = new OutputType() { @Override protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { return new BoilerpipeContentHandler(getOutputWriter(output, encoding)); } }; /** * Returns a output writer with the given encoding. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a> * @param output output stream * @param encoding output encoding, * or <code>null</code> for the platform default * @return output writer * @throws UnsupportedEncodingException * if the given encoding is not supported */ private static Writer getOutputWriter(OutputStream output, String encoding) throws UnsupportedEncodingException { if (encoding != null) { return new OutputStreamWriter(output, encoding); } else if (System.getProperty("os.name").toLowerCase().startsWith("mac os x")) { // TIKA-324: Override the default encoding on Mac OS X return new OutputStreamWriter(output, "UTF-8"); } else { return new OutputStreamWriter(output); } } /** * Returns a transformer handler that serializes incoming SAX events * to XHTML or HTML (depending the given method) using the given output * encoding. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a> * @param output output stream * @param method "xml" or "html" * @param encoding output encoding, * or <code>null</code> for the platform default * @return {@link System#out} transformer handler * @throws TransformerConfigurationException * if the transformer can not be created */ private static TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding, boolean prettyPrint) throws TransformerConfigurationException { SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no"); if (encoding != null) { handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding); } handler.setResult(new StreamResult(output)); return handler; } private ParseContext context; private Parser parser; private boolean prettyPrint = true; private Detector detector; private OutputType type = null; private String outputFormat; private IHtmlFormater formater; private String tmpPath = null; private String pdfToTextPath = null; private String swfToHtmlPath = null; private String djVuTextPath = null; private String contentType; private Metadata metadata; private ByteArrayOutputStream output; private HashMap<String, String> meta; private HashMap<String, String> meta2; private String text; /** * Output character encoding, or <code>null</code> for platform default */ private String encoding = null; /** * Password for opening encrypted documents, or <code>null</code>. */ private String password = null; public TikaWrapper(String outputFormat, String outputEncoding) throws Exception { encoding = outputEncoding; if (encoding == null || "".equals(encoding)) encoding = "UTF-8"; context = new ParseContext(); detector = new DefaultDetector(); parser = new AutoDetectParser(detector); this.outputFormat = outputFormat; //this.contentType = contentType; this.formater = null; context.set(Parser.class, parser); context.set(PasswordProvider.class, new PasswordProvider() { public String getPassword(Metadata metadata) { return password; } }); } public TikaWrapper(String outputFormat) throws Exception { this(outputFormat, "UTF-8"); } public void process(InputStream input) throws MalformedURLException { process(input, null); } public void process(InputStream input, String contentType) throws MalformedURLException { try { this.contentType = contentType; if (OUTPUT_FORMAT_XML.equals(outputFormat)) { type = XML; } else if (OUTPUT_FORMAT_HTML.equals(outputFormat)) { type = HTML; } else if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) { type = TEXT; } else if (OUTPUT_FORMAT_TEXT_MAIN.equals(outputFormat)) { type = TEXT_MAIN; } else { if (contentType == null || "".equals(contentType)) throw new Exception("Incoherent parameters (missing content-type)"); if (!CONTENT_TYPE_HTML.equals(contentType) && (OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY.equals(outputFormat) || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT.equals(outputFormat) || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE.equals(outputFormat) || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA.equals(outputFormat))) { throw new Exception("Incoherent parameters (text/html content-type expected)"); } } text = null; meta2 = null; metadata = null; meta = null; if (usePftToText()) { processWithPdfToText(input); } else if (useSwfToHtml()) { processWithSwfToHtml(input); } else if (useDjVuText()) { processWithDjVuText(input); } else if (useAlternateHtmlParser()) { htmlToText(input); } else { metadata = new Metadata(); processWithTika(TikaInputStream.get(input)); } } catch (Exception e) { } } private void processWithTika(InputStream input) { try { output = new ByteArrayOutputStream(); try { type.process(input, output, metadata); } finally { input.close(); } } catch (Exception e) { } } private void htmlToText(InputStream input) { String rawData = convertStreamToString(input); try { Document doc = Jsoup.parse(rawData); meta2 = new HashMap<String, String>(); if (OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY.equals(outputFormat)) { ArticleTextExtractor extractor = new ArticleTextExtractor(); OutputFormatter outputFormater = new OutputFormatter(10); outputFormater.setNodesToKeepCssSelector("p,h1,h2,h3,h4,h5,h6"); extractor.setOutputFormatter(outputFormater); JResult res = extractor.extractContent(rawData); text = res.getText(); meta2.put(META_TITLE, res.getTitle()); //date = res.getDate(); // yyyy/mm/dd /* date = SHelper.completeDate(SHelper.estimateDate(url)); if (date!=null) { Pattern p = Pattern.compile("^([0-9]{4})\\/([0-9]{2})\\/([0-9]{2})"); Matcher m = p.matcher(date); if (m.find()) { date = m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " 00:00:00"; } else { date = ""; } } else { date = ""; } */ //imageUrl = res.getImageUrl(); //imageUrl = HttpUtils.urlGetAbsoluteURL(url, res.getImageUrlBestMatch()); } else { if (OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE.equals(outputFormat)) text = ArticleExtractor.INSTANCE.getText(rawData); if (OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT.equals(outputFormat)) text = DefaultExtractor.INSTANCE.getText(rawData); if (OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA.equals(outputFormat)) text = CanolaExtractor.INSTANCE.getText(rawData); if (doc != null) { meta2.put(META_TITLE, doc.select("title").text()); } } if (doc != null) { if (getMetaContent(doc, "Author") != null && !"".equals(getMetaContent(doc, "Author"))) meta2.put(META_AUTHOR, getMetaContent(doc, "Author")); String creationDate = getMetaContent(doc, "CreationDate"); if (creationDate != null) { // 20130322143113Z00'00' -> 2013-03-22T14:31:13Z Pattern p = Pattern.compile("[0-9]{14}Z[0-9]{2}'[0-9]{2}'"); Matcher m = p.matcher(creationDate); if (m.find()) { String value = String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ", creationDate.substring(0, 4), creationDate.substring(4, 6), creationDate.substring(6, 8), creationDate.substring(8, 10), creationDate.substring(10, 12), creationDate.substring(12, 14)); meta2.put(META_CREATED, value); } else { // 20130322143113+02'00' -> 2013-03-22T14:31:13Z p = Pattern.compile("[0-9]{14}\\+[0-9]{2}'[0-9]{2}'"); m = p.matcher(creationDate); if (m.find()) { String value = String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ", creationDate.substring(0, 4), creationDate.substring(4, 6), creationDate.substring(6, 8), creationDate.substring(8, 10), creationDate.substring(10, 12), creationDate.substring(12, 14)); meta2.put(META_CREATED, value); } } } } meta2.put(META_CONTENTSIZE, String.valueOf(rawData.length())); meta2.put(META_CONTENTTYPE, CONTENT_TYPE_HTML); } catch (Exception e) { e.printStackTrace(); } } private static String convertStreamToString(InputStream input) { try { InputStreamReader is = new InputStreamReader(input); StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(is); String read = br.readLine(); while (read != null) { sb.append(read); read = br.readLine(); } return sb.toString(); } catch (Exception e) { return null; } } public String getText() { if (output != null) return output.toString(); return text; } public String getMetaAuthor() { return getMetas() != null ? getMetas().get(META_AUTHOR) : null; } public String getMetaCreated() { return getMetas() != null ? getMetas().get(META_CREATED) : null; } public String getMetaTitle() { return getMetas() != null ? getMetas().get(META_TITLE) : null; } public String getMetaModified() { return getMetas() != null ? getMetas().get(META_MODIFIED) : null; } public String getMetaContentType() { if (getMetas() == null) return null; String value = getMetas().get(META_CONTENTTYPE); if (value != null && value.indexOf(";") != -1) value = value.substring(0, value.indexOf(";")).trim(); return value; } public String getMetaCharSet() { if (getMetas() == null) return null; String value = getMetas().get(META_CONTENTTYPE); if (value != null && value.indexOf(";") != -1) value = value.substring(value.indexOf(";") + 1).trim(); else value = null; return value; } public Map<String, String> getMetas() { if (meta2 != null) return meta2; if (meta == null && metadata != null) { meta = new HashMap<String, String>(); String[] names = metadata.names(); for (String name : names) { for (String value : metadata.getValues(name)) { meta.put(name, value); } } } return meta; } public void setTempPath(String tempPath) { this.tmpPath = tempPath; } public void setPdfToTextPath(String pdfToTextPath) { this.pdfToTextPath = pdfToTextPath; } private boolean usePftToText() { return (pdfToTextPath != null && !"".equals(pdfToTextPath) && CONTENT_TYPE_PDF.equals(contentType)); } public void setSwfToHtmlPath(String swfToHtmlPath) { this.swfToHtmlPath = swfToHtmlPath; } private boolean useSwfToHtml() { return (swfToHtmlPath != null && !"".equals(swfToHtmlPath) && CONTENT_TYPE_SWF.equals(contentType)); } public void setDjVuTextPath(String djVuTextPath) { this.djVuTextPath = djVuTextPath; } private boolean useDjVuText() { return (djVuTextPath != null && !"".equals(djVuTextPath) && CONTENT_TYPE_DJVU.equals(contentType)); } public void setHtmlFormater(IHtmlFormater formater) { this.formater = formater; } public boolean useAlternateHtmlParser() { return (CONTENT_TYPE_HTML.equals(contentType) && (OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY.equals(outputFormat) || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT.equals(outputFormat) || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE.equals(outputFormat) || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA.equals(outputFormat))); } private String getMetaContent(Document doc, String metaName) { Elements e = doc.select("meta[name=" + metaName + "]"); if (e == null || e.first() == null) return null; return e.first().attr("content"); } private boolean writeToFile(File tempFile, InputStream input) { try { OutputStream out = new FileOutputStream(tempFile); byte buf[] = new byte[1024]; int len; while ((len = input.read(buf)) > 0) out.write(buf, 0, len); out.close(); input.close(); } catch (Exception e) { if (tempFile != null && tempFile.exists()) tempFile.delete(); e.printStackTrace(); return false; } return true; } private void processWithPdfToText(InputStream input) { File tempFile = null; File tempFile2 = null; try { if (input != null && pdfToTextPath != null && !"".equals(pdfToTextPath)) { // Get a local copy of the file tempFile = createTempFile("tmp", ".pdf", tmpPath); if (!writeToFile(tempFile, input)) return; meta2 = new HashMap<String, String>(); meta2.put(META_CONTENTSIZE, String.valueOf(tempFile.length())); tempFile2 = createTempFile("tmp", ".html", tmpPath); Shell sh = new Shell(); // Convert with PDFTOTEXT - pdftotext -enc UTF-8 -raw -q -htmlmeta -eol unix in.pdf out.html sh.exec(pdfToTextPath, "-enc", "UTF-8", "-raw", "-q", "-htmlmeta", "-eol", "unix", tempFile.getAbsolutePath(), tempFile2.getAbsolutePath()).consumeAsString(); tempFile.delete(); // Load in string and add the <meta http-equiv='Content-Type' content='text/html; charset=utf-8'> line InputStreamReader fr1 = new InputStreamReader(new FileInputStream(tempFile2), "UTF-8"); BufferedReader br1 = new BufferedReader(fr1); StringBuilder sb = new StringBuilder(); while (br1.ready()) { String line = br1.readLine(); sb.append(line).append("\n"); if ("</head>".equals(line)) { sb.append("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>") .append("\n"); } } br1.close(); tempFile2.delete(); meta2.put(META_CONTENTTYPE, CONTENT_TYPE_PDF); text = sb.toString(); Document doc = Jsoup.parse(text); if (doc != null) { meta2.put(META_TITLE, doc.select("title").text()); meta2.put(META_AUTHOR, getMetaContent(doc, "Author")); String creationDate = getMetaContent(doc, "CreationDate"); if (creationDate != null) { // 20130322143113Z00'00' -> 2013-03-22T14:31:13Z Pattern p = Pattern.compile("[0-9]{14}Z[0-9]{2}'[0-9]{2}'"); Matcher m = p.matcher(creationDate); if (m.find()) { String value = String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ", creationDate.substring(0, 4), creationDate.substring(4, 6), creationDate.substring(6, 8), creationDate.substring(8, 10), creationDate.substring(10, 12), creationDate.substring(12, 14)); meta2.put(META_CREATED, value); } else { // 20130322143113+02'00' -> 2013-03-22T14:31:13Z p = Pattern.compile("[0-9]{14}\\+[0-9]{2}'[0-9]{2}'"); m = p.matcher(creationDate); if (m.find()) { String value = String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ", creationDate.substring(0, 4), creationDate.substring(4, 6), creationDate.substring(6, 8), creationDate.substring(8, 10), creationDate.substring(10, 12), creationDate.substring(12, 14)); meta2.put(META_CREATED, value); } } } if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) { Document doc2 = new Cleaner(Whitelist.basic()).clean(doc); text = doc2.body().text(); } } } } catch (Exception e) { if (tempFile != null && tempFile.exists()) tempFile.delete(); if (tempFile2 != null && tempFile2.exists()) tempFile2.delete(); e.printStackTrace(); text = null; meta2 = null; } } public void processWithSwfToHtml(InputStream input) { File tempFile = null; File tempFile2 = null; try { if (input != null && swfToHtmlPath != null && !"".equals(swfToHtmlPath)) { // Get a local copy of the file tempFile = File.createTempFile("tmp", ".swf"); if (!writeToFile(tempFile, input)) return; // Convert with SWF2HTML tempFile2 = File.createTempFile("tmp", ".html"); Shell sh = new Shell(); sh.exec(swfToHtmlPath, "-o", tempFile2.getAbsolutePath(), tempFile.getAbsolutePath()) .consumeAsString(); tempFile.delete(); String data = FileUtils.readFileToString(tempFile2, "UTF-8"); tempFile2.delete(); meta2 = new HashMap<String, String>(); meta2.put(META_CONTENTSIZE, String.valueOf(data.length())); meta2.put(META_CONTENTTYPE, CONTENT_TYPE_SWF); if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) { if (formater != null) { data = formater.getPlainText(data); } else { data = Jsoup.parse(data).body().text(); } } text = data; } } catch (Exception e) { if (tempFile != null && tempFile.exists()) tempFile.delete(); if (tempFile2 != null && tempFile2.exists()) tempFile2.delete(); e.printStackTrace(); } } private void processWithDjVuText(InputStream input) { // TODO : http://djvu.sourceforge.net/doc/man/djvutxt.html // djvutxt inputdjvufile outputtxtfile // http://www.global-language.com/CENTURY/ File tempFile = null; File tempFile2 = null; try { if (input != null && djVuTextPath != null && !"".equals(djVuTextPath)) { // Get a local copy of the file tempFile = createTempFile("tmp", ".pdf", tmpPath); if (!writeToFile(tempFile, input)) return; // Convert with SWF2HTML tempFile2 = File.createTempFile("tmp", ".txt"); Shell sh = new Shell(); sh.exec(djVuTextPath, tempFile.getAbsolutePath(), tempFile2.getAbsolutePath()).consumeAsString(); tempFile.delete(); String data = FileUtils.readFileToString(tempFile2, "UTF-8"); tempFile2.delete(); text = data; } } catch (Exception e) { if (tempFile != null && tempFile.exists()) tempFile.delete(); if (tempFile2 != null && tempFile2.exists()) tempFile2.delete(); e.printStackTrace(); } } private static File createTempFile(String prefix, String suffix, String directory) throws IOException { File tmpFile = null; if (directory == null) directory = ""; if (!"".equals(directory)) tmpFile = new File(directory); if (tmpFile == null || !tmpFile.exists() || !tmpFile.isDirectory()) return File.createTempFile(prefix, suffix); else return File.createTempFile(prefix, suffix, tmpFile); } }