Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.ruta.resource;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.ruta.RutaStream;
import org.apache.uima.ruta.type.RutaBasic;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;

/**
 * A word list stored as a character tree: every word is a path of {@link TextNode}s starting at
 * the root, and the node of the last character of a word is flagged as word end.
 * <p>
 * The list can be built from plain text (one word per line, {@code .txt}), from a persisted tree
 * ({@code .twl}, plain XML or a zip with a single XML entry), or from an in-memory list of words.
 */
public class TreeWordList implements RutaWordList {

  /** Root of the character tree; stays {@code null} until a tree has been built or a word added. */
  private TextNode root;

  /** Display name of this list, returned by {@link #toString()}. */
  private String name;

  /** When set, whitespace characters of added words are not stored in the tree. */
  private boolean dictRemoveWS = false;

  /**
   * Default constructor. Creates a list without a tree; the tree is created by one of the
   * {@code buildNewTree} methods, by {@link #readXML(InputStream, String)} or by the first call
   * of {@link #addWord(String)}.
   */
  public TreeWordList() {
    this.root = null;
  }

  /**
   * Constructs a TreeWordList from a resource. The resource stream is always closed before this
   * constructor returns.
   * 
   * @param resource
   *          Resource to create a TreeWordList from
   * @param dictRemoveWS
   *          whether whitespace characters are dropped from the words stored in the tree
   * @throws IOException
   *           When the resource cannot be opened or read.
   * @throws IllegalArgumentException
   *           When {@code resource.getFilename()} is null or does not end with .txt or .twl.
   */
  public TreeWordList(Resource resource, boolean dictRemoveWS) throws IOException {
    this.dictRemoveWS = dictRemoveWS;
    final String name = resource.getFilename();
    InputStream stream = null;
    try {
      stream = resource.getInputStream();
      if (name == null) {
        throw new IllegalArgumentException("List does not have a name.");
      } else if (name.endsWith(".txt")) {
        buildNewTree(stream);
      } else if (name.endsWith(".twl")) {
        readXML(stream, "UTF-8");
      } else {
        throw new IllegalArgumentException("File name should end with .twl or .txt, found " + name);
      }
    } finally {
      if (stream != null) {
        stream.close();
      }
    }
    this.name = name;
  }

  /**
   * Constructs a TreeWordList from a file with path = pathname
   * 
   * @param pathname
   *          path of the file to create a TreeWordList from
   * @param dictRemoveWS
   *          whether whitespace characters are dropped from the words stored in the tree
   * @throws IOException
   *           When the file cannot be opened or read.
   */
  public TreeWordList(String pathname, boolean dictRemoveWS) throws IOException {
    this(new FileSystemResource(pathname), dictRemoveWS);
  }

  /**
   * Constructs a TreeWordList from an open stream with a given name. The extension of the name
   * selects the format: {@code .twl} is read as persisted tree, {@code .txt} as plain text. For
   * any other extension nothing is read and the list stays empty.
   * 
   * @param stream
   *          open stream providing the content of the word list
   * @param name
   *          name (or path) of the list; only its last path segment is kept as list name
   * @param dictRemoveWS
   *          whether whitespace characters are dropped from the words stored in the tree
   * @throws IOException
   *           When the stream cannot be read.
   */
  public TreeWordList(InputStream stream, String name, boolean dictRemoveWS) throws IOException {
    this.dictRemoveWS = dictRemoveWS;
    if (name.endsWith(".twl")) {
      readXML(stream, "UTF-8");
    }
    if (name.endsWith(".txt")) {
      buildNewTree(stream);
    }
    this.name = new File(name).getName();
  }

  /**
   * Constructs a TreeWordList named "local" from the given words.
   * 
   * @param data
   *          the words to store
   * @param dictRemoveWS
   *          whether whitespace characters are dropped from the words stored in the tree
   */
  public TreeWordList(List<String> data, boolean dictRemoveWS) {
    name = "local";
    this.dictRemoveWS = dictRemoveWS;
    buildNewTree(data);
  }

  /**
   * Replaces the current tree by a new one containing the given words.
   * 
   * @param data
   *          the words to store
   */
  public void buildNewTree(List<String> data) {
    this.root = new TextNode();
    for (String s : data) {
      addWord(s);
    }
  }

  /**
   * Replaces the current tree by a new one built from a UTF-8 text stream with one word per line.
   * Lines are trimmed, and a trailing "=" (old list format) is removed.
   * 
   * @param stream
   *          Open InputStream containing the words for the treeWordList, this method will close
   *          the stream.
   * @throws IOException
   *           declared for callers; reading is done by a {@link Scanner}
   */
  public void buildNewTree(InputStream stream) throws IOException {
    Scanner scan = new Scanner(stream, "UTF-8");
    try {
      // creating a new tree
      this.root = new TextNode();
      while (scan.hasNextLine()) {
        String s = scan.nextLine().trim();
        // HOTFIX for old formats
        if (s.endsWith("=")) {
          s = s.substring(0, s.length() - 1);
          s = s.trim();
        }
        addWord(s);
      }
    } finally {
      // closing the scanner also closes the underlying stream, even if adding a word failed
      scan.close();
    }
  }

  /**
   * Returns the root node of the tree
   * 
   * @return the root node, or {@code null} if no tree has been created yet
   */
  public TextNode getRoot() {
    return this.root;
  }

  /**
   * Add a new String into the TreeWordList. If whitespace removal is enabled for this list,
   * whitespace characters of the word are skipped.
   * 
   * @param s
   *          The String to add
   */
  public void addWord(String s) {
    if (root == null) {
      // a list created with the default constructor has no tree yet
      root = new TextNode();
    }
    // walk down the tree, creating the missing nodes on the way
    TextNode pointer = root;
    for (char each : s.toCharArray()) {
      if (dictRemoveWS && Character.isWhitespace(each)) {
        continue;
      }
      TextNode childNode = pointer.getChildNode(each);
      if (childNode == null) {
        childNode = new TextNode(each, false);
        pointer.addChild(childNode);
      }
      pointer = childNode;
    }
    pointer.setWordEnd(s.length() > 0);
  }

  /**
   * Checks if TreeWordList contains String s as a complete word.
   * 
   * @param s
   *          the text to look up; {@code null} is never contained
   * @param ignoreCase
   *          whether upper and lower case are treated alike
   * @param size
   *          case is only ignored for texts longer than this length
   * @param ignoreChars
   *          characters of the text that may be skipped if they do not match, or {@code null}
   * @param maxIgnoreChars
   *          maximum number of occurrences of ignorable characters tolerated in the text
   * @param ignoreWS
   *          whether blanks stored in the tree may be skipped while matching
   * @return true if the text is a word of this list
   */
  public boolean contains(String s, boolean ignoreCase, int size, char[] ignoreChars,
          int maxIgnoreChars, boolean ignoreWS) {
    if (s == null) {
      return false;
    }
    TextNode pointer = root;
    return recursiveContains(pointer, s, 0, ignoreCase && s.length() > size, false, ignoreChars,
            maxIgnoreChars, ignoreWS);
  }

  /**
   * Checks if String s is the beginning of a path in the tree, i.e. a prefix of a stored word.
   * The parameters have the same meaning as for
   * {@link #contains(String, boolean, int, char[], int, boolean)}.
   * 
   * @param s
   *          the text to look up; {@code null} is never contained
   * @return true if the text can be matched along a path starting at the root
   */
  public boolean containsFragment(String s, boolean ignoreCase, int size, char[] ignoreChars,
          int maxIgnoreChars, boolean ignoreWS) {
    if (s == null) {
      // same contract as contains(...) instead of a NullPointerException
      return false;
    }
    TextNode pointer = root;
    return recursiveContains(pointer, s, 0, ignoreCase && s.length() > size, true, ignoreChars,
            maxIgnoreChars, ignoreWS);
  }

  /**
   * Matches {@code text} from position {@code index} on against the subtree below
   * {@code pointer}.
   * 
   * @param pointer
   *          node reached so far; {@code null} means the path does not exist
   * @param text
   *          the text to match
   * @param index
   *          position of the next character of the text to match
   * @param ignoreCase
   *          whether both the lower and the upper case child are followed
   * @param fragment
   *          if true, reaching the end of the text is sufficient; otherwise the reached node must
   *          be a word end
   * @param ignoreChars
   *          characters that may be skipped in the text, or {@code null}
   * @param maxIgnoreChars
   *          remaining budget of ignorable characters
   * @param ignoreWS
   *          whether blanks stored in the tree may be skipped
   * @return true if a match was found
   */
  private boolean recursiveContains(TextNode pointer, String text, int index, boolean ignoreCase,
          boolean fragment, char[] ignoreChars, int maxIgnoreChars, boolean ignoreWS) {
    if (pointer == null) {
      return false;
    }
    if (index == text.length()) {
      return fragment || pointer.isWordEnd();
    }
    char charAt = text.charAt(index);
    boolean charAtIgnored = false;
    if (ignoreChars != null) {
      for (char each : ignoreChars) {
        if (each == charAt) {
          charAtIgnored = true;
          maxIgnoreChars--;
          break;
        }
      }
      // the first character of the text is never skipped
      charAtIgnored &= index != 0;
      if (maxIgnoreChars < 0) {
        return false;
      }
    }

    int next = ++index;

    boolean result = false;

    if (ignoreCase) {
      TextNode childNodeL = pointer.getChildNode(Character.toLowerCase(charAt));
      TextNode childNodeU = pointer.getChildNode(Character.toUpperCase(charAt));

      TextNode wsNode = pointer.getChildNode(' ');
      if (ignoreWS && wsNode != null) {
        // Step over a blank in the tree without consuming a character of the text. "next" itself
        // must stay untouched: the calls below consume the current character and have to
        // continue behind it.
        result |= recursiveContains(wsNode, text, next - 1, ignoreCase, fragment, ignoreChars,
                maxIgnoreChars, ignoreWS);
      }

      if (childNodeL == null && ignoreWS) {
        childNodeL = skipWS(pointer, charAt);
      }
      if (childNodeU == null && ignoreWS) {
        childNodeU = skipWS(pointer, charAt);
      }
      if (charAtIgnored && childNodeL == null && childNodeU == null) {
        // ignorable character without a matching child: drop it and stay on the same node
        result |= recursiveContains(pointer, text, next, ignoreCase, fragment, ignoreChars,
                maxIgnoreChars, ignoreWS);
      } else {
        result |= recursiveContains(childNodeL, text, next, ignoreCase, fragment, ignoreChars,
                maxIgnoreChars, ignoreWS)
                | recursiveContains(childNodeU, text, next, ignoreCase, fragment, ignoreChars,
                        maxIgnoreChars, ignoreWS);
      }
    } else {
      TextNode wsNode = pointer.getChildNode(' ');
      if (ignoreWS && wsNode != null) {
        // see above: do not modify "next" here
        result |= recursiveContains(wsNode, text, next - 1, ignoreCase, fragment, ignoreChars,
                maxIgnoreChars, ignoreWS);
      }

      TextNode childNode = pointer.getChildNode(charAt);
      if (childNode == null && ignoreWS) {
        childNode = skipWS(pointer, charAt);
      }
      if (charAtIgnored && childNode == null) {
        // ignorable character without a matching child: drop it and stay on the same node
        result |= recursiveContains(pointer, text, next, ignoreCase, fragment, ignoreChars,
                maxIgnoreChars, ignoreWS);
      } else {
        result |= recursiveContains(childNode, text, next, ignoreCase, fragment, ignoreChars,
                maxIgnoreChars, ignoreWS);
      }
    }
    return result;
  }

  /**
   * Follows a chain of blank children below {@code pointer} and returns the first node reached
   * over such a chain that represents {@code charAt}.
   * 
   * @param pointer
   *          node to start at
   * @param charAt
   *          the character to find behind the blanks
   * @return the matching node, or {@code null} if the chain of blanks ends without a match
   */
  private TextNode skipWS(TextNode pointer, char charAt) {
    TextNode childNode = pointer.getChildNode(' ');
    if (childNode != null) {
      TextNode node = childNode.getChildNode(charAt);
      if (node == null) {
        return skipWS(childNode, charAt);
      } else {
        return node;
      }
    }
    return null;
  }

  /**
   * Searches the stream for sequences of basic annotations whose concatenated covered text is a
   * word of this list. Each basic annotation of the stream is used as start of a candidate, which
   * is extended by the following basics as long as it is still a fragment of a stored word.
   * 
   * @param stream
   *          the stream to search; its position is moved by this method
   * @param ignoreCase
   *          whether upper and lower case are treated alike
   * @param size
   *          case is only ignored for candidates longer than this length
   * @param ignoreChars
   *          characters that may be skipped in the text, or {@code null}
   * @param maxIgnoredChars
   *          maximum number of occurrences of ignorable characters tolerated in a candidate
   * @param ignoreWS
   *          whether blanks stored in the tree may be skipped while matching
   * @return newly created annotations spanning the matches
   */
  public List<AnnotationFS> find(RutaStream stream, boolean ignoreCase, int size,
          char[] ignoreChars, int maxIgnoredChars, boolean ignoreWS) {
    List<AnnotationFS> results = new ArrayList<AnnotationFS>();
    stream.moveToFirst();
    FSIterator<AnnotationFS> streamPointer = stream.copy();
    while (stream.isValid()) {
      RutaBasic anchorBasic = (RutaBasic) stream.get();
      streamPointer.moveTo(anchorBasic);

      List<RutaBasic> basicsToAdd = new ArrayList<RutaBasic>();
      basicsToAdd.add(anchorBasic);
      String text = anchorBasic.getCoveredText();
      StringBuilder candidate = new StringBuilder(text);
      // last complete word seen while extending the candidate
      Annotation interResult = null;
      while (streamPointer.isValid()) {
        if (containsFragment(candidate.toString(), ignoreCase, size, ignoreChars, maxIgnoredChars,
                ignoreWS)) {
          streamPointer.moveToNext();
          if (streamPointer.isValid()) {
            RutaBasic next = (RutaBasic) streamPointer.get();
            if (contains(candidate.toString(), ignoreCase, size, ignoreChars, maxIgnoredChars,
                    ignoreWS)) {
              interResult = new Annotation(stream.getJCas(), basicsToAdd.get(0).getBegin(),
                      basicsToAdd.get(basicsToAdd.size() - 1).getEnd());
            }
            candidate.append(next.getCoveredText());
            basicsToAdd.add(next);
          } else {
            // end of the stream reached while the candidate is still a fragment
            tryToCreateAnnotation(stream, ignoreCase, size, results, basicsToAdd,
                    candidate.toString(), interResult, ignoreChars, maxIgnoredChars, ignoreWS);
          }
        } else {
          // the basic added last broke the match
          basicsToAdd.remove(basicsToAdd.size() - 1);
          tryToCreateAnnotation(stream, ignoreCase, size, results, basicsToAdd,
                  candidate.toString(), interResult, ignoreChars, maxIgnoredChars, ignoreWS);
          break;
        }
      }
      stream.moveToNext();
    }
    return results;
  }

  /**
   * Same as {@link #find(RutaStream, boolean, int, char[], int, boolean)} without ignorable
   * characters.
   */
  public List<AnnotationFS> find(RutaStream stream, boolean ignoreCase, int size,
          boolean ignoreWS) {
    return find(stream, ignoreCase, size, null, 0, ignoreWS);
  }

  /**
   * Adds an annotation spanning {@code basicsToAdd} to the results if {@code lastCandidate} is a
   * word of this list; otherwise falls back to {@code interResult} if there is one.
   */
  private void tryToCreateAnnotation(RutaStream stream, boolean ignoreCase, int size,
          List<AnnotationFS> results, List<RutaBasic> basicsToAdd, String lastCandidate,
          Annotation interResult, char[] ignoreChars, int maxIgnoredChars, boolean ignoreWS) {
    if (basicsToAdd.size() >= 1
            && contains(lastCandidate, ignoreCase, size, ignoreChars, maxIgnoredChars, ignoreWS)) {
      results.add(new Annotation(stream.getJCas(), basicsToAdd.get(0).getBegin(),
              basicsToAdd.get(basicsToAdd.size() - 1).getEnd()));
    } else if (interResult != null) {
      results.add(interResult);
    }
  }

  /**
   * Replaces the current tree by the tree persisted in the given stream. The content is either
   * plain XML or a zip whose first entry is the XML.
   * <p>
   * XML parse errors and parser configuration errors are reported on the console and not
   * rethrown; the tree then contains whatever the handler had built up to the error.
   * 
   * @param stream
   *          the stream to read; it is not closed by this method
   * @param encoding
   *          name of the character encoding of the XML
   * @throws IOException
   *           When the stream cannot be read.
   */
  public void readXML(InputStream stream, String encoding) throws IOException {
    try {
      InputStream is = new BufferedInputStream(stream); // adds mark/reset support
      boolean isXml = MultiTreeWordListPersistence.isSniffedXmlContentType(is);
      if (!isXml) { // MTWL is encoded
        is = new ZipInputStream(is);
        ((ZipInputStream) is).getNextEntry(); // zip must contain a single entry
      }
      InputStreamReader streamReader = new InputStreamReader(is, encoding);
      this.root = new TextNode();
      XMLEventHandler handler = new XMLEventHandler(root);
      SAXParserFactory factory = SAXParserFactory.newInstance();
      SAXParser parser = factory.newSAXParser();
      XMLReader reader = parser.getXMLReader();
      reader.setContentHandler(handler);
      reader.setErrorHandler(handler);
      reader.parse(new InputSource(streamReader));
    } catch (SAXParseException spe) {
      StringBuilder sb = new StringBuilder(spe.toString());
      sb.append("\n  Line number: " + spe.getLineNumber());
      sb.append("\n Column number: " + spe.getColumnNumber());
      sb.append("\n Public ID: " + spe.getPublicId());
      sb.append("\n System ID: " + spe.getSystemId() + "\n");
      System.out.println(sb.toString());
    } catch (SAXException se) {
      System.out.println("loadDOM threw " + se);
      se.printStackTrace(System.out);
    } catch (ParserConfigurationException e) {
      e.printStackTrace();
    }
  }

  /**
   * Persists the tree of this list as compressed TWL file.
   * 
   * @param path
   *          path of the file to write
   * @param encoding
   *          name of the character encoding of the XML
   * @throws IOException
   *           When the file cannot be written.
   */
  public void createTWLFile(String path, String encoding) throws IOException {
    createTWLFile(root, path, true, encoding);
  }

  /**
   * Persists the tree of this list as TWL file.
   * 
   * @param path
   *          path of the file to write
   * @param compressed
   *          whether the XML is wrapped into a zip
   * @param encoding
   *          name of the character encoding of the XML
   * @throws IOException
   *           When the file cannot be written.
   */
  public void createTWLFile(String path, boolean compressed, String encoding) throws IOException {
    createTWLFile(root, path, compressed, encoding);
  }

  /**
   * Persists the given tree as TWL file.
   * 
   * @param root
   *          root of the tree to write
   * @param path
   *          path of the file to write
   * @param compressed
   *          whether the XML is wrapped into a zip
   * @param encoding
   *          name of the character encoding of the XML
   * @throws IOException
   *           When the file cannot be written.
   */
  public void createTWLFile(TextNode root, String path, boolean compressed, String encoding)
          throws IOException {
    if (compressed) {
      writeCompressedTWLFile(root, path, encoding);
    } else {
      writeUncompressedMTWLFile(root, path, encoding);
    }
  }

  /** Writes the tree as XML into a zip file with a single entry named like the path. */
  private void writeCompressedTWLFile(TextNode root, String path, String encoding)
          throws IOException {
    FileOutputStream fos = new FileOutputStream(path);
    try {
      BufferedOutputStream bos = new BufferedOutputStream(fos);
      ZipOutputStream zos = new ZipOutputStream(bos);
      OutputStreamWriter writer = new OutputStreamWriter(zos, encoding);
      zos.putNextEntry(new ZipEntry(path));
      writeTWLFile(root, writer);
      writer.flush();
      zos.closeEntry();
      writer.close();
    } finally {
      // no-op after a successful writer.close(); releases the file handle if writing failed
      fos.close();
    }
  }

  /** Writes the tree as plain XML file. */
  private void writeUncompressedMTWLFile(TextNode root, String path, String encoding)
          throws IOException {
    FileOutputStream output = new FileOutputStream(path);
    try {
      OutputStreamWriter writer = new OutputStreamWriter(output, encoding);
      writeTWLFile(root, writer);
      writer.close();
    } finally {
      // no-op after a successful writer.close(); releases the file handle if writing failed
      output.close();
    }
  }

  /** Writes the XML document for the tree: a root element containing the nested nodes. */
  private void writeTWLFile(TextNode root, Writer writer) throws IOException {
    writer.write("<?xml version=\"1.0\" ?>");
    writer.write("<root>");
    for (TextNode child : root.getChildren().values()) {
      writeNode(writer, child);
    }
    writer.write("</root>");
  }

  /**
   * Writes the given node and, nested inside it, all of its descendants as XML elements. The
   * character of the node is escaped so that the attribute stays well-formed.
   * 
   * @param writer
   *          the writer to write to
   * @param node
   *          the node to write
   * @throws IOException
   *           When writing fails.
   */
  public void writeNode(Writer writer, TextNode node) throws IOException {
    String output = "<node char=\"" + escapeXmlAttribute(String.valueOf(node.getValue()))
            + "\" isWordEnd=\"" + Boolean.toString(node.isWordEnd()) + "\">";
    writer.write(output);
    for (TextNode child : node.getChildren().values()) {
      writeNode(writer, child);
    }
    writer.write("</node>");
  }

  /**
   * Escapes a value for use inside a double-quoted XML attribute. Markup characters become
   * entity references; tab, line feed and carriage return become character references because
   * an XML parser would otherwise normalize them to a blank.
   */
  private static String escapeXmlAttribute(String raw) {
    StringBuilder sb = new StringBuilder(raw.length() + 8);
    for (int i = 0; i < raw.length(); i++) {
      char c = raw.charAt(i);
      switch (c) {
        case '&':
          sb.append("&amp;");
          break;
        case '<':
          sb.append("&lt;");
          break;
        case '>':
          sb.append("&gt;");
          break;
        case '"':
          sb.append("&quot;");
          break;
        case '\'':
          sb.append("&apos;");
          break;
        case '\t':
          sb.append("&#9;");
          break;
        case '\n':
          sb.append("&#10;");
          break;
        case '\r':
          sb.append("&#13;");
          break;
        default:
          sb.append(c);
          break;
      }
    }
    return sb.toString();
  }

  /** Returns the name of this list. */
  @Override
  public String toString() {
    return name;
  }

  /** Not implemented by this word list; always returns {@code null}. */
  public List<AnnotationFS> find(RutaStream stream, Map<String, Object> typeMap,
          boolean ignoreCase, int ignoreLength, boolean edit, double distance,
          String ignoreToken) {
    return null;
  }

  /** Not implemented by this word list; always returns {@code null}. */
  public List<String> contains(String string, boolean ignoreCase, int ignoreLength, boolean edit,
          double distance, String ignoreToken) {
    return null;
  }

  /** Not implemented by this word list; always returns {@code null}. */
  public List<String> containsFragment(String string, boolean ignoreCase, int ignoreLength,
          boolean edit, double distance, String ignoreToken) {
    return null;
  }

  /** Does nothing. */
  public void startDocument() {
  }

  /** Does nothing. */
  public void endDocument() {
  }

}