// Java tutorial
/* * DocumentContentImpl.java * * Copyright (c) 1995-2012, The University of Sheffield. See the file * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt * * This file is part of GATE (see http://gate.ac.uk/), and is free * software, licenced under the GNU Library General Public License, * Version 2, June 1991 (in the distribution as file licence.html, * and also available at http://gate.ac.uk/gate/licence.html). * * Hamish Cunningham, 11/Feb/2000 * * $Id$ */ package gate.corpora; import gate.DocumentContent; import gate.util.BomStrippingInputStreamReader; import gate.util.InvalidOffsetException; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; import java.util.zip.GZIPInputStream; import org.apache.commons.io.IOUtils; /** Represents the commonalities between all sorts of document contents. */ public class DocumentContentImpl implements DocumentContent { /** Buffer size for reading * 16k is 4 times the block size on most filesystems * so it should be efficient for most cases * */ private static final int INTERNAL_BUFFER_SIZE = 16 * 1024; /** Default construction */ public DocumentContentImpl() { content = new String(); } // default construction /** Contruction from URL and offsets. 
*/ public DocumentContentImpl(URL u, String encoding, Long start, Long end) throws IOException { int readLength = 0; char[] readBuffer = new char[INTERNAL_BUFFER_SIZE]; BufferedReader uReader = null; InputStream uStream = null; StringBuffer buf = new StringBuffer(); long s = 0, e = Long.MAX_VALUE; if (start != null && end != null) { s = start.longValue(); e = end.longValue(); } try { URLConnection conn = u.openConnection(); uStream = conn.getInputStream(); if ("gzip".equals(conn.getContentEncoding())) { uStream = new GZIPInputStream(uStream); } if (encoding != null && !encoding.equalsIgnoreCase("")) { uReader = new BomStrippingInputStreamReader(uStream, encoding, INTERNAL_BUFFER_SIZE); } else { uReader = new BomStrippingInputStreamReader(uStream, INTERNAL_BUFFER_SIZE); } ; // 1. skip S characters uReader.skip(s); // 2. how many character shall I read? long toRead = e - s; // 3. read gtom source into buffer while (toRead > 0 && (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1) { if (toRead < readLength) { //well, if toRead(long) is less than readLenght(int) //then there can be no overflow, so the cast is safe readLength = (int) toRead; } buf.append(readBuffer, 0, readLength); toRead -= readLength; } } finally { // 4.close reader IOUtils.closeQuietly(uReader); IOUtils.closeQuietly(uStream); } content = new String(buf); originalContent = content; } // Contruction from URL and offsets /** Propagate changes to the document content. */ void edit(Long start, Long end, DocumentContent replacement) { int s = start.intValue(), e = end.intValue(); String repl = replacement == null ? 
"" : ((DocumentContentImpl) replacement).content; StringBuffer newContent = new StringBuffer(content); newContent.replace(s, e, repl); content = newContent.toString(); } // edit(start,end,replacement) @Override public DocumentContent getContent(Long start, Long end) throws InvalidOffsetException { if (!isValidOffsetRange(start, end)) throw new InvalidOffsetException("Invalid offset range " + start + " to " + end + " for document content of length " + this.size()); return new DocumentContentImpl(content.substring(start.intValue(), end.intValue())); } // getContent(start, end) /** Returns the String representing the content in case of a textual document. * NOTE: this is a temporary solution until we have a more generic one. */ @Override public String toString() { return content; } /** The size of this content (e.g. character length for textual * content). */ @Override public Long size() { return new Long(content.length()); } // size() /** Check that an offset is valid */ boolean isValidOffset(Long offset) { if (offset == null) return false; long o = offset.longValue(); long len = content.length(); if (o > len || o < 0) return false; return true; } // isValidOffset /** Check that both start and end are valid offsets and that * they constitute a valid offset range */ boolean isValidOffsetRange(Long start, Long end) { return isValidOffset(start) && isValidOffset(end) && start.longValue() <= end.longValue(); } // isValidOffsetRange(start,end) /** Two documents are the same if their contents is the same */ @Override public boolean equals(Object other) { if (!(other instanceof DocumentContentImpl)) return false; DocumentContentImpl docImpl = (DocumentContentImpl) other; return content.equals(docImpl.toString()); } // equals /** Calculate the hash value for the object. */ @Override public int hashCode() { return toString().hashCode(); } /** Just for now - later we have to cater for different types of * content. 
*/ String content; /** * For preserving the original content of the document. * The edit command didn't affect on the original content. * If you construct the content by URL the originalContent will keep * whole information retrieved by URL even you set some start and end. */ String originalContent; /** * Return the original content of the document received during the loading * phase or on construction from string. */ public String getOriginalContent() { return originalContent; } /** For ranges */ public DocumentContentImpl(String s) { content = s; originalContent = content; } /** Freeze the serialization UID. */ static final long serialVersionUID = -1426940535575467461L; } // class DocumentContentImpl