// Java tutorial  (NOTE(review): stray non-Java text above the license header; commented out so the file compiles)
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
 * PDF Content Extractor. This will parse the text content of a PDF and apply a
 * regex to search for links within the body of the text.
 *
 * Requires itextpdf jar:
 * http://repo1.maven.org/maven2/com/itextpdf/itextpdf/5.5.0/itextpdf-5.5.0.jar
 *
 * @contributor adam
 */
public class ExtractorPDFContent extends ContentExtractor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static final Logger LOGGER =
            Logger.getLogger(ExtractorPDFContent.class.getName());

    /**
     * Matches http(s) URLs embedded in extracted PDF text, including URLs that
     * were split across a line break by the PDF layout (group 19 captures the
     * post-newline continuation).
     *
     * NOTE(review): the groups referenced in innerExtract (1, 2, 6, 13, 19)
     * depend on the exact position of every capturing group below — do not
     * reorder or regroup this pattern without re-deriving those indices.
     */
    public static final Pattern URLPattern = Pattern.compile(
            "(?i)\\(?(https?):\\/\\/" +                                          // protocol
            "(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+" +           // username
            "(:([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+)?" +        // password
            "@)?(?" +                                                            // auth requires @
            ")((([a-z0-9]\\.|[a-z0-9][a-z0-9-]*[a-z0-9]\\.)*" +                  // domain segments AND
            "[a-z][a-z0-9-]*[a-z0-9]" +                                          // top level domain OR
            "|((\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5])\\.){3}" +
            "(\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5])" +                       // IP address
            ")(:\\d+)?)" +                                                       // port
            "(((\\/+([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*" +    // path
            "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?)?)?" +   // query string
            "(\\n(?!http://)" +                                                  // possible newline (seems to happen in pdfs)
            "((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*" +    // continue possible path
            "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?" +       // or possible query
            ")?");

    /**
     * The maximum size of PDF files to consider. PDFs larger than this
     * maximum will not be searched for links.
     */
    {
        setMaxSizeToParse(10 * 1024 * 1024L); // 10MB
    }
    public long getMaxSizeToParse() {
        return (Long) kp.get("maxSizeToParse");
    }
    public void setMaxSizeToParse(long threshold) {
        kp.put("maxSizeToParse", threshold);
    }

    public ExtractorPDFContent() {
    }

    /**
     * Parses the recorded PDF content page by page, collects every URL the
     * {@link #URLPattern} regex finds in the extracted text, and adds each as
     * a NAVLINK out-link on {@code curi}.
     *
     * @param curi the CrawlURI whose recorded content is parsed
     * @return true when extraction is complete (even if no links were found);
     *         false if the PDF could not be read, in which case the error is
     *         recorded as a non-fatal failure on {@code curi}
     */
    protected boolean innerExtract(CrawlURI curi) {
        PdfReader documentReader = null;
        List<String> uris = new ArrayList<String>();

        try {
            // FIX(review): the replay stream and PdfReader were previously
            // never closed — a resource leak on every parsed PDF.
            InputStream content = curi.getRecorder().getContentReplayInputStream();
            try {
                documentReader = new PdfReader(content);
                for (int i = 1; i <= documentReader.getNumberOfPages(); i++) { // Page numbers start at 1
                    String pageParseText = extractPageText(documentReader, i);
                    Matcher matcher = URLPattern.matcher(pageParseText);
                    while (matcher.find()) {
                        String prospectiveURL =
                                pageParseText.substring(matcher.start(), matcher.end()).trim();

                        // handle URLs wrapped in parentheses
                        if (prospectiveURL.startsWith("(")) {
                            prospectiveURL = prospectiveURL.substring(1);
                            if (prospectiveURL.endsWith(")")) {
                                prospectiveURL =
                                        prospectiveURL.substring(0, prospectiveURL.length() - 1);
                            }
                        }
                        uris.add(prospectiveURL);

                        // parsetext URLs tend to end in a '.' if they are in a
                        // sentence, queue without trailing '.'
                        if (prospectiveURL.endsWith(".") && prospectiveURL.length() > 2) {
                            uris.add(prospectiveURL.substring(0, prospectiveURL.length() - 1));
                        }

                        // Full regex allows newlines which seem to be common, also
                        // add match without newline in case we are wrong.
                        // Group 19 is the newline continuation; groups 1/2/6/13 are
                        // protocol/userinfo/host+port/path+query respectively.
                        if (matcher.group(19) != null) {
                            String alternateURL = matcher.group(1) + "://"
                                    + (matcher.group(2) != null ? matcher.group(2) : "")
                                    + matcher.group(6) + matcher.group(13);
                            // Again, handle URLs wrapped in parentheses
                            if (prospectiveURL.startsWith("(") && alternateURL.endsWith(")")) {
                                alternateURL =
                                        alternateURL.substring(0, alternateURL.length() - 1);
                            }
                            uris.add(alternateURL);
                        }
                    }
                }
            } finally {
                content.close();
            }
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            return false;
        } catch (RuntimeException e) {
            // iText throws unchecked exceptions on malformed PDFs; treat them
            // as non-fatal just like I/O errors.
            curi.getNonFatalFailures().add(e);
            return false;
        } finally {
            if (documentReader != null) {
                documentReader.close();
            }
        }

        if (uris.isEmpty()) {
            return true;
        }

        for (String uri : uris) {
            try {
                LinkContext lc = LinkContext.NAVLINK_MISC;
                Hop hop = Hop.NAVLINK;
                CrawlURI out = curi.createCrawlURI(uri, lc, hop);
                curi.getOutLinks().add(out);
            } catch (URIException e1) {
                logUriError(e1, curi.getUURI(), uri);
            }
        }

        numberOfLinksExtracted.addAndGet(uris.size());
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine(curi + " has " + uris.size() + " links.");
        }
        // Set flag to indicate that link extraction is completed.
        return true;
    }

    /**
     * Extracts the plain text of one page of an already-opened PDF.
     *
     * @param documentReader open reader for the PDF document
     * @param pageNum 1-based page number
     * @return the page's text, or the empty string if iText fails to parse
     *         the page (the failure is logged at WARNING)
     */
    public String extractPageText(PdfReader documentReader, int pageNum) {
        String content = "";
        PdfReaderContentParser parser = new PdfReaderContentParser(documentReader);
        TextExtractionStrategy strat;
        try {
            strat = parser.processContent(pageNum, new SimpleTextExtractionStrategy());
            content = strat.getResultantText();
        } catch (IOException e) {
            LOGGER.log(Level.WARNING,
                    "Failed to parse pdf text in " + Thread.currentThread().getName(), e);
        }
        return content;
    }

    /**
     * Only extract from responses whose Content-Type is application/pdf and
     * whose recorded size does not exceed {@link #getMaxSizeToParse()}.
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        long max = getMaxSizeToParse();
        if (uri.getRecorder().getRecordedInput().getSize() > max) {
            return false;
        }
        String ct = uri.getContentType();
        return (ct != null) && (ct.startsWith("application/pdf"));
    }
}