Java tutorial
/* * DocumentFormat.java * * Copyright (c) 1995-2012, The University of Sheffield. See the file * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt * * This file is part of GATE (see http://gate.ac.uk/), and is free * software, licenced under the GNU Library General Public License, * Version 2, June 1991 (in the distribution as file licence.html, * and also available at http://gate.ac.uk/gate/licence.html). * * Hamish Cunningham, 25/May/2000 * * $Id$ */ package gate; import gate.corpora.MimeType; import gate.corpora.RepositioningInfo; import gate.creole.AbstractLanguageResource; import gate.event.StatusListener; import gate.util.BomStrippingInputStreamReader; import gate.util.DocumentFormatException; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.Vector; import org.apache.commons.io.IOUtils; /** The format of Documents. Subclasses of DocumentFormat know about * particular MIME types and how to unpack the information in any * markup or formatting they contain into GATE annotations. Each MIME * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves * with a static index residing here when they are constructed. Static * getDocumentFormat methods can then be used to get the appropriate * format class for a particular document. */ public abstract class DocumentFormat extends AbstractLanguageResource implements LanguageResource { private static final long serialVersionUID = 4147880563349143923L; /** The MIME type of this format. */ private MimeType mimeType = null; /** Map of MimeTypeString to ClassHandler class. This is used to find the * language resource that deals with the specific Document format */ protected static final Map<String, DocumentFormat> mimeString2ClassHandlerMap = new HashMap<String, DocumentFormat>(); /** Map of MimeType to DocumentFormat Class. This is used to find the * DocumentFormat subclass that deals with a particular MIME type. */ protected static final Map<String, MimeType> mimeString2mimeTypeMap = new HashMap<String, MimeType>(); /** Map of Set of file suffixes to MimeType. This is used to figure * out what MIME type a document is from its file name. */ protected static final Map<String, MimeType> suffixes2mimeTypeMap = new HashMap<String, MimeType>(); /** Map of Set of magic numbers to MimeType. This is used to guess the * MIME type of a document, when we don't have any other clues. */ protected static final Map<String, MimeType> magic2mimeTypeMap = new HashMap<String, MimeType>(); /** Map of markup elements to annotation types. If it is null, the * unpackMarkup() method will convert all markup, using the element names * for annotation types. If it is non-null, only those elements specified * here will be converted. */ protected Map<String, String> markupElementsMap = null; /** This map is used inside uppackMarkup() method... * When an element from the map is encounted, The corresponding string * element is added to the document content */ protected Map<String, String> element2StringMap = null; /** The features of this resource */ private FeatureMap features = null; /** Default construction */ public DocumentFormat() { } /** listeners for status report */ private transient Vector<StatusListener> statusListeners; /** Flag for enable/disable collecting of repositioning information */ private Boolean shouldCollectRepositioning = new Boolean(false); /** If the document format could collect repositioning information * during the unpack phase this method will return <B>true</B>. * <BR> * You should override this method in the child class of the defined * document format if it could collect the repositioning information. */ public Boolean supportsRepositioning() { return new Boolean(false); } // supportsRepositioning public void setShouldCollectRepositioning(Boolean b) { if (supportsRepositioning().booleanValue() && b.booleanValue()) { shouldCollectRepositioning = b; } else { shouldCollectRepositioning = new Boolean(false); } // if } // setShouldCollectRepositioning public Boolean getShouldCollectRepositioning() { return shouldCollectRepositioning; } // /** Unpack the markup in the document. This converts markup from the * native format (e.g. XML, RTF) into annotations in GATE format. * Uses the markupElementsMap to determine which elements to convert, and * what annotation type names to use. */ abstract public void unpackMarkup(Document doc) throws DocumentFormatException; abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException; /** Unpack the markup in the document. This method calls unpackMarkup on the * GATE document, but after it saves its content as a feature attached to * the document. This method is useful if one wants to save the content * of the document being unpacked. After the markups have been unpacked, * the content of the document will be replaced with a new one containing * the text between markups. * * @param doc the document that will be unpacked * @param originalContentFeatureType the name of the feature that will hold * the document's content. */ public void unpackMarkup(Document doc, String originalContentFeatureType) throws DocumentFormatException { FeatureMap fm = doc.getFeatures(); if (fm == null) fm = Factory.newFeatureMap(); fm.put(originalContentFeatureType, doc.getContent().toString()); doc.setFeatures(fm); unpackMarkup(doc); }// unpackMarkup(); /** * Returns a MimeType having as input a fileSufix. * If the file sufix is <b>null</b> or not recognised then, * <b>null</b> will be returned. * @param fileSufix The file sufix associated with a recognisabe mime type. * @return The MimeType associated with this file suffix. */ static private MimeType getMimeType(String fileSufix) { // Get a mimeType string associated with this fileSuffix // Eg: for html returns MimeType("text/html"), for xml returns // MimeType("text/xml") if (fileSufix == null) return null; return suffixes2mimeTypeMap.get(fileSufix.toLowerCase()); }//getMimeType public static Set<String> getSupportedMimeTypes() { return Collections.unmodifiableSet(mimeString2mimeTypeMap.keySet()); } /** * Returns a MymeType having as input a URL object. If the MimeType wasn't * recognized it returns <b>null</b>. * @param url The URL object from which the MimeType will be extracted * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is * unknown. */ static private MimeType getMimeType(URL url) { String mimeTypeString = null; String charsetFromWebServer = null; String contentType = null; InputStream is = null; MimeType mimeTypeFromWebServer = null; MimeType mimeTypeFromFileSuffix = null; MimeType mimeTypeFromMagicNumbers = null; if (url == null) return null; // Ask the web server for the content type // We expect to get contentType something like this: // "text/html; charset=iso-8859-1" // Charset is optional try { try { URLConnection urlconn = url.openConnection(); is = urlconn.getInputStream(); contentType = urlconn.getContentType(); } catch (IOException e) { // Failed to get the content type with te Web server. // Let's try some other methods like FileSuffix or magic numbers. } // If a content Type was returned by the server, try to get the mime Type // string // If contentType is something like this:"text/html; charset=iso-8859-1" // try to get content Type string (text/html) if (contentType != null) { StringTokenizer st = new StringTokenizer(contentType, ";"); // We assume that the first token is the mime type string... // If this doesn't happen then BAD LUCK :(( ... if (st.hasMoreTokens()) mimeTypeString = st.nextToken().toLowerCase(); // The next token it should be the CharSet if (st.hasMoreTokens()) charsetFromWebServer = st.nextToken().toLowerCase(); if (charsetFromWebServer != null) { //We have something like : "charset=iso-8859-1" and let's extract the // encoding. st = new StringTokenizer(charsetFromWebServer, "="); // Don't need this anymore charsetFromWebServer = null; // Discarding the first token which is : "charset" if (st.hasMoreTokens()) st.nextToken(); // Get the encoding : "ISO-8859-1" if (st.hasMoreTokens()) charsetFromWebServer = st.nextToken().toUpperCase(); } // End if } // end if // Return the corresponding MimeType with WebServer from the associated MAP mimeTypeFromWebServer = mimeString2mimeTypeMap.get(mimeTypeString); // Let's try a file suffix detection // mimeTypeFromFileSuffix = getMimeType(getFileSuffix(url)); for (String suffix : getFileSuffixes(url)) { mimeTypeFromFileSuffix = getMimeType(suffix); if (mimeTypeFromFileSuffix != null) break; } // Let's perform a magic numbers guess.. mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is, charsetFromWebServer); } finally { IOUtils.closeQuietly(is); //null safe } //All those types enter into a deciding system return decideBetweenThreeMimeTypes(mimeTypeFromWebServer, mimeTypeFromFileSuffix, mimeTypeFromMagicNumbers); }//getMimeType /** * This method decides what mimeType is in majority * @param aMimeTypeFromWebServer a MimeType * @param aMimeTypeFromFileSuffix a MimeType * @param aMimeTypeFromMagicNumbers a MimeType * @return the MimeType which occurs most. If all are null, then returns * <b>null</b> */ protected static MimeType decideBetweenThreeMimeTypes(MimeType aMimeTypeFromWebServer, MimeType aMimeTypeFromFileSuffix, MimeType aMimeTypeFromMagicNumbers) { // First a voting system if (areEqual(aMimeTypeFromWebServer, aMimeTypeFromFileSuffix)) return aMimeTypeFromFileSuffix; if (areEqual(aMimeTypeFromFileSuffix, aMimeTypeFromMagicNumbers)) return aMimeTypeFromFileSuffix; if (areEqual(aMimeTypeFromWebServer, aMimeTypeFromMagicNumbers)) return aMimeTypeFromWebServer; // 1 is the highest priority if (aMimeTypeFromFileSuffix != null) aMimeTypeFromFileSuffix.addParameter("Priority", "1"); // 2 is the second priority if (aMimeTypeFromWebServer != null) aMimeTypeFromWebServer.addParameter("Priority", "2"); // 3 is the third priority if (aMimeTypeFromMagicNumbers != null) aMimeTypeFromMagicNumbers.addParameter("Priority", "3"); return decideBetweenTwoMimeTypes(decideBetweenTwoMimeTypes(aMimeTypeFromWebServer, aMimeTypeFromFileSuffix), aMimeTypeFromMagicNumbers); }// decideBetweenThreeMimeTypes /** Decide between two mimeTypes. The decistion is made on "Priority" * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes * doesn't have "Priority" paramether set, it will return one on them. * @param aMimeType a MimeType object with "Prority" parameter set * @param anotherMimeType a MimeType object with "Prority" parameter set * @return One of the two mime types. */ protected static MimeType decideBetweenTwoMimeTypes(MimeType aMimeType, MimeType anotherMimeType) { if (aMimeType == null) return anotherMimeType; if (anotherMimeType == null) return aMimeType; int priority1 = 0; int priority2 = 0; // Both of them are not null if (aMimeType.hasParameter("Priority")) try { priority1 = new Integer(aMimeType.getParameterValue("Priority")).intValue(); } catch (NumberFormatException e) { return anotherMimeType; } if (anotherMimeType.hasParameter("Priority")) try { priority2 = new Integer(anotherMimeType.getParameterValue("Priority")).intValue(); } catch (NumberFormatException e) { return aMimeType; } // The lower the number, the highest the priority if (priority1 <= priority2) return aMimeType; else return anotherMimeType; }// decideBetweenTwoMimeTypes /** * Tests if two MimeType objects are equal. * @return true only if boths MimeType objects are different than <b>null</b> * and their Types and Subtypes are equals. The method is case sensitive. */ protected static boolean areEqual(MimeType aMimeType, MimeType anotherMimeType) { if (aMimeType == null || anotherMimeType == null) return false; if (aMimeType.getType().equals(anotherMimeType.getType()) && aMimeType.getSubtype().equals(anotherMimeType.getSubtype())) return true; else return false; }// are Equal /** * This method tries to guess the mime Type using some magic numbers. * @param aInputStream a InputStream which has to be transformed into a * InputStreamReader * @param anEncoding the encoding. If is null or unknown then a * InputStreamReader with default encodings will be created. * @return the mime type associated with magic numbers */ protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream, String anEncoding) { if (aInputStream == null) return null; Reader reader = null; if (anEncoding != null) try { reader = new BomStrippingInputStreamReader(aInputStream, anEncoding); } catch (UnsupportedEncodingException e) { reader = null; } if (reader == null) // Create a reader with the default encoding system reader = new BomStrippingInputStreamReader(aInputStream); // We have a input stream reader return runMagicNumbers(reader); }//guessTypeUsingMagicNumbers /** Performs magic over Gate Document */ protected static MimeType runMagicNumbers(Reader aReader) { // No reader, nothing to detect if (aReader == null) return null; // Prepare to run the magic stuff String strBuffer = null; int bufferSize = 2048; int charReads = 0; char[] cbuf = new char[bufferSize]; try { charReads = aReader.read(cbuf, 0, bufferSize); } catch (IOException e) { return null; } // End try if (charReads == -1) // the document is empty return null; // Create a string form the buffer and perform some search on it. strBuffer = new String(cbuf, 0, charReads); // If this fails then surrender return getTypeFromContent(strBuffer); }// runMagicNumbers private static MimeType getTypeFromContent(String aContent) { // change case to cover more variants aContent = aContent.toLowerCase(); // the mime type we have detected (null to start with) MimeType detectedMimeType = null; // the offset of the first match now we use a "first wins" priority int firstOffset = Integer.MAX_VALUE; // Run the magic numbers test for (Map.Entry<String, MimeType> kv : magic2mimeTypeMap.entrySet()) { // the magic code we are looking for String magic = kv.getKey().toLowerCase(); // the offset of this code in the content int offset = aContent.indexOf(magic.toLowerCase()); if (offset != -1 && offset < firstOffset) { // if the magic code exists in the doc and appears before any others // than use that mime type detectedMimeType = kv.getValue(); firstOffset = offset; } } // return the mime type (null if we failed) return detectedMimeType; } /** * Return the fileSuffix or null if the url doesn't have a file suffix * If the url is null then the file suffix will be null also */ @SuppressWarnings("unused") private static String getFileSuffix(URL url) { String fileName = null; String fileSuffix = null; // GIGO test (garbage in garbage out) if (url != null) { // get the file name from the URL fileName = url.getFile(); // tokenize this file name with "." as separator... // the last token will be the file suffix StringTokenizer st = new StringTokenizer(fileName, "."); // fileSuffix is the last token while (st.hasMoreTokens()) fileSuffix = st.nextToken(); // here fileSuffix is the last token } // End if return fileSuffix; }//getFileSufix /** * Given a URL, this method returns all the 'file extensions' for the file * part of the URL. For this purposes, a 'file extension' is any sequence of * .-separated tokens (such as .gate.xml.gz). The order the extensions are * returned in is from the most specific (longest) to the most generic * (shortest) one, e.g. [.gate.xml.gz, .xml.gz, .gz]. */ private static List<String> getFileSuffixes(URL url) { List<String> res = new LinkedList<String>(); if (url != null) { // get the file name from the URL String fileName = url.getPath(); int pos = fileName.lastIndexOf('/'); if (pos > 0) fileName = fileName.substring(pos); pos = fileName.indexOf('.', 1); while (pos > 0 && pos < fileName.length() - 1) { res.add(fileName.substring(pos + 1)); pos = fileName.indexOf('.', pos + 1); } } return res; } /** * Find a DocumentFormat implementation that deals with a particular * MIME type, given that type. * @param aGateDocument this document will receive as a feature * the associated Mime Type. The name of the feature is * MimeType and its value is in the format type/subtype * @param mimeType the mime type that is given as input */ static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, MimeType mimeType) { FeatureMap aFeatureMap = null; if (mimeType == null) { String content = aGateDocument.getContent().toString(); // reduce size for better performance if (content.length() > 2048) content = content.substring(0, 2048); mimeType = getTypeFromContent(content); } if (mimeType != null) { // If the Gate Document doesn't have a feature map atached then // We will create and set one. if (aGateDocument.getFeatures() == null) { aFeatureMap = Factory.newFeatureMap(); aGateDocument.setFeatures(aFeatureMap); } // end if aGateDocument.getFeatures().put("MimeType", mimeType.getType() + "/" + mimeType.getSubtype()); return mimeString2ClassHandlerMap.get(mimeType.getType() + "/" + mimeType.getSubtype()); } // end If return null; } // getDocumentFormat(aGateDocument, MimeType) /** * Find a DocumentFormat implementation that deals with a particular * MIME type, given the file suffix (e.g. ".txt") that the document came * from. * @param aGateDocument this document will receive as a feature * the associated Mime Type. The name of the feature is * MimeType and its value is in the format type/subtype * @param fileSuffix the file suffix that is given as input */ static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, String fileSuffix) { return getDocumentFormat(aGateDocument, getMimeType(fileSuffix)); } // getDocumentFormat(String) /** * Find the DocumentFormat implementation that deals with the given * MIME type. * * @param mimeType the MIME type you want the DocumentFormat for * @return the DocumentFormat associated with the MIME type or null if * the MIME type does not have a registered DocumentFormat */ public static DocumentFormat getDocumentFormat(MimeType mimeType) { return mimeString2ClassHandlerMap.get(mimeType.getType() + "/" + mimeType.getSubtype()); } /** * Find a DocumentFormat implementation that deals with a particular * MIME type, given the URL of the Document. If it is an HTTP URL, we * can ask the web server. If it has a recognised file extension, we * can use that. Otherwise we need to use a map of magic numbers * to MIME types to guess the type, and then look up the format using the * type. * @param aGateDocument this document will receive as a feature * the associated Mime Type. The name of the feature is * MimeType and its value is in the format type/subtype * @param url the URL that is given as input */ static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, URL url) { return getDocumentFormat(aGateDocument, getMimeType(url)); } // getDocumentFormat(URL) /** Get the feature set */ @Override public FeatureMap getFeatures() { return features; } /** Get the markup elements map */ public Map<String, String> getMarkupElementsMap() { return markupElementsMap; } /** Get the element 2 string map */ public Map<String, String> getElement2StringMap() { return element2StringMap; } /** Set the markup elements map */ public void setMarkupElementsMap(Map<String, String> markupElementsMap) { this.markupElementsMap = markupElementsMap; } /** Set the element 2 string map */ public void setElement2StringMap(Map<String, String> anElement2StringMap) { element2StringMap = anElement2StringMap; } /** Set the features map*/ @Override public void setFeatures(FeatureMap features) { this.features = features; } /** Set the mime type*/ public void setMimeType(MimeType aMimeType) { mimeType = aMimeType; } /** Gets the mime Type*/ public MimeType getMimeType() { return mimeType; } /** * Utility method to get a {@link MimeType} given the type string. */ public static MimeType getMimeTypeForString(String typeString) { return mimeString2mimeTypeMap.get(typeString); } /** * Utility method to get the set of all file suffixes that are registered * with this class. */ public static Set<String> getSupportedFileSuffixes() { return Collections.unmodifiableSet(suffixes2mimeTypeMap.keySet()); } //StatusReporter Implementation public synchronized void removeStatusListener(StatusListener l) { if (statusListeners != null && statusListeners.contains(l)) { @SuppressWarnings("unchecked") Vector<StatusListener> v = (Vector<StatusListener>) statusListeners.clone(); v.removeElement(l); statusListeners = v; } } public synchronized void addStatusListener(StatusListener l) { @SuppressWarnings("unchecked") Vector<StatusListener> v = statusListeners == null ? new Vector<StatusListener>(2) : (Vector<StatusListener>) statusListeners.clone(); if (!v.contains(l)) { v.addElement(l); statusListeners = v; } } protected void fireStatusChanged(String e) { if (statusListeners != null) { int count = statusListeners.size(); for (int i = 0; i < count; i++) { statusListeners.elementAt(i).statusChanged(e); } } } } // class DocumentFormat