Java tutorial
/* * Copyright 2003-2004 Michael Franken, Zilverline. * * The contents of this file, or the files included with this file, are subject to * the current version of ZILVERLINE Collaborative Source License for the * Zilverline Search Engine (the "License"); You may not use this file except in * compliance with the License. * * You may obtain a copy of the License at * * http://www.zilverline.org. * * See the License for the rights, obligations and * limitations governing use of the contents of the file. * * The Original and Upgraded Code is the Zilverline Search Engine. The developer of * the Original and Upgraded Code is Michael Franken. Michael Franken owns the * copyrights in the portions it created. All Rights Reserved. * */ package org.zilverline.extractors; import java.io.File; import java.io.IOException; import java.io.Reader; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.util.StringUtils; import org.zilverline.core.Extractor; import org.zilverline.core.ParsedFileInfo; import org.zilverline.util.FileUtils; import org.zilverline.util.Utils; /** * Abstract baseclass of extractors. Extractors extract all relevant info from a File, and return the info in a ParsedFileInfo * Object. * * @author Michael Franken * @version $Revision: 1.20 $ * * @see org.zilverline.core.ParsedFileInfo */ public abstract class AbstractExtractor implements Extractor { /** default size of summary extracted from the file. */ private static final int SUMMARY_SIZE = 200; /** * logger for Commons logging. This is non-static final protected, such that it defines a log for all subclasses too. */ protected final Log log = LogFactory.getLog(getClass().getName()); private final static Log log2 = LogFactory.getLog(AbstractExtractor.class); /** default size of summary extracted from the file. */ private ParsedFileInfo fileInfo = new ParsedFileInfo(); /** * Set the file and all file related information of the document, such as length and modification date. * * @param f The file that is being parsed */ public final void setFile(final File f) { fileInfo.setFile(f); fileInfo.setSize(f.length()); fileInfo.setModificationDate(f.lastModified()); } /** * Set the type of the document. * * @param type such as EXCEL, PDF */ public final void setType(final String type) { fileInfo.setType(type); } /** * Set the author of the document. * * @param author the author */ public final void setAuthor(final String author) { fileInfo.setAuthor(author); } /** * Set the isbn number of the document. * * @param ISBN the ISBN number */ public final void setISBN(final String ISBN) { fileInfo.setISBN(ISBN); } /** * Set the title of the document. * * @param title the title */ public final void setTitle(final String title) { fileInfo.setTitle(title); } /** * Set the size of the document. * * @param size the size in bytes */ public final void setSize(final long size) { fileInfo.setSize(size); } /** * Set the modificationDate of the document. * * @param modificationDate the modificationDate in milliseconds since January 1, 1970, 00:00:00 GMT */ public final void setModificationDate(final long modificationDate) { fileInfo.setModificationDate(modificationDate); } /** * Set the creationDate of the document. * * @param creationDate the creationDate in milliseconds since January 1, 1970, 00:00:00 GMT */ public final void setCreationDate(final long creationDate) { fileInfo.setCreationDate(creationDate); } /** * Set the summary of the document. * * @param summary the summary */ public final void setSummary(final String summary) { fileInfo.setSummary(summary); } /** * Extract the content from the given file. As a side effect other attributes of ParsedFileInfo may be set too. * * Implementations should catch all checked exceptions, sensibly, And close all resources. * * @param f The file to extract the content from. * * @return Reader containing text-only content */ public abstract Reader getContent(final File f); /** * This method extracts all relevant info of the file as an ParsedFileInfo object. Uses getContent as callback. * * @param f the File to extract content from * * @return ParsedFileInfo the object containing relevant info of the provided file */ public final ParsedFileInfo extractInfo(final File f) { if (f == null) { log.warn("Something went terribly wrong, file = null, returning null "); return null; } try { setFile(f); Reader reader = getContent(f); fileInfo.setReader(reader); // get the summary from the reader if (reader != null) { String summary = fileInfo.getSummary(); if (!StringUtils.hasText(summary)) { char[] sumChars = new char[SUMMARY_SIZE]; int numChars = 0; try { if (reader.markSupported()) { reader.mark(SUMMARY_SIZE); numChars = reader.read(sumChars); reader.reset(); } if (numChars > 0) { summary = new String(sumChars, 0, numChars); } if (log.isDebugEnabled()) { log.debug("Summary extracted from reader: " + summary); } setSummary(getSummaryFromContent(summary)); } catch (IOException e) { log.warn("Error extracting summary form reader", e); } } } // Set the title if there's none yet if (!StringUtils.hasLength(fileInfo.getTitle())) { fileInfo.setTitle(FileUtils.getBasename(f)); } } catch (Exception e) { // here we don't throw any, since we do not want to interrupt the indexing process log.warn("Unexpected Error extracting content from " + f.getName(), e); } catch (OutOfMemoryError e) { // this happens with very, very large Documents log.error("Very Serious Error. Out of Memory for very large documents: " + f.getName() + ", try increasing your JVM heap size: for example, start your server with option '-Xmx128m'." + " Skipping file.", e); } catch (Throwable e) { log.error("Very Serious Error while extracting contents from: " + f.getName(), e); } return fileInfo; } /** * Get a ISBN number from the given text. * * @param text the plain text, can be null * @return a valid ISBNnumber (10 characters without -) or else "" */ public static String getISBNFromContent(final String text) { if (text == null) { return ""; } // ISBN:0764543857 String ISBNnumber = ""; int j; // does text contain ISBN or isbn? if (((j = text.indexOf("ISBN")) != -1) || (j = text.indexOf("isbn")) != -1) { // look 25 characters forward ISBNnumber = text.substring(j, j + 25); // remove ISBN.. (all text until first number) ISBNnumber = ISBNnumber.replaceFirst("[\\D]+", ""); // remove all non-valid ISBN characters (0-9xX and - seem valid), remove - as well ISBNnumber = ISBNnumber.replaceAll("[^0-9xX]", ""); if (ISBNnumber.length() > 10) { ISBNnumber = ISBNnumber.substring(0, 10); } log2.debug("possible ISBN found: " + ISBNnumber); if (!Utils.isValidISBNNumber(ISBNnumber)) { return ""; } } return ISBNnumber; } /** * Get a summary from the given text. * * @param text the plain text, can be null * @return the summary */ public static String getSummaryFromContent(final String text) { if (!StringUtils.hasText(text)) { return ""; } // alternative: just the first characters: String summary = text.substring(0, Math.min(text.length(), SUMMARY_SIZE)); // SimpleSummariser sum = new SimpleSummariser(); // get two representative lines // String summary = sum.summarise(text, 2); // return with minimal whitespace return summary.replaceAll("\\s+", " "); } }