Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.uima.ruta.engine; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.commons.lang3.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Feature; import org.apache.uima.cas.Type; import org.apache.uima.cas.TypeSystem; import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.ruta.UIMAConstants; import org.apache.uima.util.CasCopier; import org.apache.uima.util.Level; import org.htmlparser.Parser; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * This Analysis Engine is able to convert html content from a source view into a plain string * representation stored in an output view. Especially, the Analysis Engine transfers annotations * under consideration of the changed document text and annotation offsets in the new view. The copy * process also sets features, however, features of type annotation are currently not supported. * Note that if an annotation would have the same start and end positions in the new view, i.e., if * it would be mapped to an annotation of length 0, it is not moved to the new view. * * The HTML Converter also supports heuristic and explicit conversion patterns which default to * html4 decoding, e.g., "{@literal }", "{@literal <}", etc. Concepts like tables or * lists are not supported. * * Note that in general it is suggested to run an html cleaner before any further processing to * avoid problems with malformed html. * * A descriptor file for this Analysis Engine is located in the folder <code>descriptor/utils</code> * of a UIMA Ruta project. * */ public class HtmlConverter extends JCasAnnotator_ImplBase { public static final String NAMESPACE = "org.apache.uima.ruta.type.html."; public static final String DEFAULT_MODIFIED_VIEW = "plaintext"; public static final String LINEBREAK = "\n"; /** * This string parameter specifies the name of the new view. The default value is * <code>plaintext</code>. */ public static final String PARAM_OUTPUT_VIEW = "outputView"; @ConfigurationParameter(name = PARAM_OUTPUT_VIEW, mandatory = false, defaultValue = DEFAULT_MODIFIED_VIEW) private String modifiedViewName; /** * This string parameter can optionally be set to specify the name of the input view. */ public static final String PARAM_INPUT_VIEW = "inputView"; @ConfigurationParameter(name = PARAM_INPUT_VIEW, mandatory = false) private String inputViewName; /** * This boolean parameter determines if linebreaks inside the text nodes are kept or removed. The * default behavior is <code>true</code>. */ public static final String PARAM_REPLACE_LINEBREAKS = "replaceLinebreaks"; @ConfigurationParameter(name = PARAM_REPLACE_LINEBREAKS, mandatory = false, defaultValue = "true") private Boolean replaceLinebreaks; /** * This boolean parameter determines if the converter should skip whitespaces. Html documents * often contains whitespaces for indentation and formatting, which should not be reproduced in * the converted plain text document. If the parameter is set to false, then the whitespces are * not removed. This behavior is useful, if not Html documents are converted, but XMl files. The * default value is true. */ public static final String PARAM_SKIP_WHITESPACES = "skipWhitespaces"; @ConfigurationParameter(name = PARAM_SKIP_WHITESPACES, mandatory = false, defaultValue = "true") private Boolean skipWhitespaces; /** * If this boolean parameter is set to true, then the tags of the complete document is processed * and not only those within the body tag. */ public static final String PARAM_PROCESS_ALL = "processAll"; @ConfigurationParameter(name = PARAM_PROCESS_ALL, mandatory = false, defaultValue = "false") private Boolean processAll; /** * If this boolean parameter is set to true, then zero-length annotation will not be dropped, but * they will be assigned to the offset of the "nearest" annotation. In that case, a boolean * feature names offsetsExpanded will be set to true if available. */ public static final String PARAM_EXPAND_OFFSETS = "expandOffsets"; @ConfigurationParameter(name = PARAM_EXPAND_OFFSETS, mandatory = false, defaultValue = "false") private Boolean expandOffsets; /** * This string parameter determines the character sequence that replaces a linebreak. The default * behavior is the empty string. */ public static final String PARAM_LINEBREAK_REPLACEMENT = "linebreakReplacement"; @ConfigurationParameter(name = PARAM_LINEBREAK_REPLACEMENT, mandatory = false, defaultValue = "") private String linebreakReplacement; /** * This string array parameter sets the names of the html tags that create linebreaks in the * output view. The default is <code>br, p, div, ul, ol, dl, li, h1, ..., h6, blockquote</code>. */ public static final String PARAM_NEWLINE_INDUCING_TAGS = "newlineInducingTags"; @ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAGS, mandatory = false, defaultValue = { "br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote" }) private String[] newlineInducingTags; /** * This string parameter contains a regular expression for HTML/XML elements. If the pattern * matches, then the element will introduce a new line break similar to the element of the * parameter <code>newlineInducingTags</code>. */ public static final String PARAM_NEWLINE_INDUCING_TAG_REGEXP = "newlineInducingTagRegExp"; @ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAG_REGEXP, mandatory = false) private String newlineInducingTagRegExp; /** * This string array parameter sets the names of the html tags that create additional text in the * output view. The acutal string of the gap is defined by the parameter <code>gapText</code>. */ public static final String PARAM_GAP_INDUCING_TAGS = "gapInducingTags"; @ConfigurationParameter(name = PARAM_GAP_INDUCING_TAGS, mandatory = false) private String[] gapInducingTags; /** * This string parameter determines the character sequence that is introduced by the html tags * specified in the <code>gapInducingTags</code>. */ public static final String PARAM_GAP_TEXT = "gapText"; @ConfigurationParameter(name = PARAM_GAP_TEXT, mandatory = false, defaultValue = "") private String gapText; /** * This boolean parameter sets the value of the parameter <code>gapText</code> to a single space. */ public static final String PARAM_USE_SPACE_GAP = "useSpaceGap"; @ConfigurationParameter(name = PARAM_USE_SPACE_GAP, mandatory = false, defaultValue = "") private Boolean useSpaceGap; /** * This string array parameter can be used to apply custom conversions. It defaults to a list of * commonly used codes, e.g., {@literal }, which are converted using html 4 entity * unescaping. However, explicit conversion strings can also be passed via the parameter * <code>conversionReplacements</code>. Remember to enable explicit conversion via * <code>conversionPolicy</code> first. */ public static final String PARAM_CONVERSION_PATTERNS = "conversionPatterns"; @ConfigurationParameter(name = PARAM_CONVERSION_PATTERNS, mandatory = false, defaultValue = { " ", "«", "»", """, "&", "<", ">", "'", "§", "¨", "©", "™", "®", "ö", "ä", "ü", " " }) private String[] conversionPatterns; /** * This string parameter determines the conversion policy used, either "heuristic", "explicit", or * "none". When the value is "explicit", the parameters <code>conversionPatterns</code> and * optionally <code>conversionReplacements</code> are considered. The "heuristic" conversion * policy uses simple regular expressions to decode html4 entities such as "{@literal }". * The default behavior is "heuristic". */ public static final String PARAM_CONVERSION_POLICY = "conversionPolicy"; @ConfigurationParameter(name = PARAM_CONVERSION_POLICY, mandatory = false, defaultValue = "heuristic") private String conversionPolicy; /** * This string array parameter corresponds to <code>conversionPatterns</code> such that * <code>conversionPatterns[i]</code> will be replaced by <code>conversionReplacements[i]</code>; * replacements should be shorter than the source pattern. Per default, the replacement strings * are computed using Html4 decoding. Remember to enable explicit conversion via * <code>conversionPolicy</code> first. */ public static final String PARAM_CONVERSION_REPLACEMENTS = "conversionReplacements"; @ConfigurationParameter(name = PARAM_CONVERSION_REPLACEMENTS, mandatory = false) private String[] conversionReplacements; private int[] map; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); inputViewName = (String) aContext.getConfigParameterValue(PARAM_INPUT_VIEW); inputViewName = StringUtils.isBlank(inputViewName) ? null : inputViewName; modifiedViewName = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_VIEW); modifiedViewName = StringUtils.isBlank(modifiedViewName) ? DEFAULT_MODIFIED_VIEW : modifiedViewName; replaceLinebreaks = (Boolean) aContext.getConfigParameterValue(PARAM_REPLACE_LINEBREAKS); replaceLinebreaks = replaceLinebreaks == null ? true : replaceLinebreaks; skipWhitespaces = (Boolean) aContext.getConfigParameterValue(PARAM_SKIP_WHITESPACES); skipWhitespaces = skipWhitespaces == null ? true : skipWhitespaces; processAll = (Boolean) aContext.getConfigParameterValue(PARAM_PROCESS_ALL); processAll = processAll == null ? true : processAll; linebreakReplacement = (String) aContext.getConfigParameterValue(PARAM_LINEBREAK_REPLACEMENT); linebreakReplacement = linebreakReplacement == null ? "" : linebreakReplacement; String conversionPolicy = (String) aContext.getConfigParameterValue(PARAM_CONVERSION_POLICY); if (StringUtils.isBlank(conversionPolicy) || conversionPolicy.equals("heuristic")) { conversionPolicy = "heuristic"; } else if (conversionPolicy.equals("explicit")) { } else if (conversionPolicy.equals("none")) { } else { throw new ResourceInitializationException("illegal conversionPolicy parameter value", new Object[0]); } String[] nlTags = (String[]) aContext.getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAGS); if (nlTags == null) { newlineInducingTags = new String[] { "br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote" }; } // check assertions if (modifiedViewName.equals(inputViewName)) { throw new ResourceInitializationException("input and output view names must differ!", new Object[0]); } conversionPatterns = (String[]) aContext.getConfigParameterValue(PARAM_CONVERSION_PATTERNS); if (conversionPatterns == null) { conversionPatterns = new String[] { " ", "«", "»", """, "&", "<", ">", "'", "§", "¨", "©", "™", "®", "ö", "ä", "ü", " " }; } conversionReplacements = (String[]) aContext.getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS); if (conversionReplacements == null) { conversionReplacements = new String[conversionPatterns.length]; for (int i = 0; i < conversionPatterns.length; i++) { String c = conversionPatterns[i]; String rep = StringEscapeUtils.unescapeHtml4(c); conversionReplacements[i] = rep; } } gapText = (String) aContext.getConfigParameterValue(PARAM_GAP_TEXT); gapText = gapText == null ? "" : gapText; useSpaceGap = (Boolean) aContext.getConfigParameterValue(PARAM_USE_SPACE_GAP); useSpaceGap = useSpaceGap == null ? false : useSpaceGap; if (useSpaceGap) { gapText = " "; } gapInducingTags = (String[]) aContext.getConfigParameterValue(PARAM_GAP_INDUCING_TAGS); gapInducingTags = gapInducingTags == null ? new String[0] : gapInducingTags; expandOffsets = (Boolean) aContext.getConfigParameterValue(PARAM_EXPAND_OFFSETS); expandOffsets = expandOffsets == null ? false : expandOffsets; newlineInducingTagRegExp = (String) aContext.getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAG_REGEXP); } @Override public void process(JCas jcaz) throws AnalysisEngineProcessException { JCas jcas; try { if (inputViewName != null) { jcas = jcaz.getView(inputViewName); } else { jcas = jcaz; } } catch (CASException e1) { throw new AnalysisEngineProcessException(e1.getCause()); } // init: String documentText = jcas.getDocumentText(); String splitSeq = documentText.contains("\r\n") ? "\r\n" : "\n"; map = new int[documentText.length() + 1]; JCas modview = null; try { // check if view already exists: Iterator<JCas> viewIterator = jcas.getViewIterator(); while (viewIterator.hasNext()) { JCas jCas2 = (JCas) viewIterator.next(); if (jCas2.getViewName().equals(modifiedViewName)) { modview = jCas2; getContext().getLogger().log(Level.WARNING, "view with name \"" + modifiedViewName + "\" already exists."); } } if (modview == null) { modview = jcas.createView(modifiedViewName); } } catch (CASException e) { e.printStackTrace(); return; } SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>(); SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>(); SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet<HtmlConverterPSpan>(); // process try { Parser parser = new Parser(documentText); NodeList list = parser.parse(null); HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, newlineInducingTagRegExp, gapInducingTags, gapText, skipWhitespaces, processAll); list.visitAllNodesWith(visitor); visibleSpansSoFar = visitor.getTextSpans(); linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags(); gapsFromHtmlTags = visitor.getGapsFromHtmlTags(); } catch (ParserException e) { throw new AnalysisEngineProcessException(e); } if (replaceLinebreaks) { visibleSpansSoFar = this.handleLinebreaksInDocumentText(visibleSpansSoFar, splitSeq); } if (conversionPolicy.equals("heuristic")) { visibleSpansSoFar = this.htmlDecoding(visibleSpansSoFar); } else if (conversionPolicy.equals("explicit")) { for (int i = 0; i < conversionPatterns.length; i++) { String pat = conversionPatterns[i]; String rep = conversionReplacements[i]; visibleSpansSoFar = this.handleConversion(visibleSpansSoFar, pat, rep); } } visibleSpansSoFar.addAll(linebreaksFromHtmlTags); visibleSpansSoFar.addAll(gapsFromHtmlTags); // create new doc-text and the map from deletions and visible-text-spans: StringBuffer sbu = new StringBuffer(documentText.length()); int originalOffsetI = 0; int outOffset = 0; for (HtmlConverterPSpan vis : visibleSpansSoFar) { final int begin = vis.getBegin(); final int end = vis.getEnd(); // map text before annotation: while (originalOffsetI < begin) { map[originalOffsetI++] = outOffset; } // get and map text/replacement: String s = ""; if (vis instanceof HtmlConverterPSpanReplacement) { // conversion/replacement: s = vis.getTxt(); // asserts that s is shorter than the original source while (originalOffsetI < begin + s.length()) { map[originalOffsetI++] = outOffset++; } while (originalOffsetI < end) { map[originalOffsetI++] = outOffset; } } else { // simple annotation: s = documentText.substring(begin, end); while (originalOffsetI < end) { map[originalOffsetI++] = outOffset++; } } sbu.append(s); } while (originalOffsetI < documentText.length()) { map[originalOffsetI++] = outOffset; } map[documentText.length()] = outOffset + 1; // handle doc end separately String modTxt = sbu.toString(); modview.setDocumentText(modTxt); // copy annotations using the 'map': try { mapAnnotations(jcas, map, modifiedViewName); } catch (CASException e) { e.printStackTrace(); } } private void mapAnnotations(JCas fromJcas, int[] map, String toView) throws CASException { JCas modview = fromJcas.getView(toView); Set<Annotation> indexedFs = new HashSet<Annotation>(); Set<Annotation> toExpand = new HashSet<Annotation>(); AnnotationIndex<Annotation> annotationIndex = fromJcas.getAnnotationIndex(); TypeSystem typeSystem = fromJcas.getTypeSystem(); Type docType = typeSystem.getType(UIMAConstants.TYPE_DOCUMENT); CasCopier casCopier = new CasCopier(fromJcas.getCas(), modview.getCas()); for (Annotation annotation : annotationIndex) { // TODO be careful here, because some people inherit from DocumentAnnotation if (typeSystem.subsumes(docType, annotation.getType())) { continue; } Annotation clone = (Annotation) casCopier.copyFs(annotation); // change the view/sofa of the new annotation... // see: http://osdir.com/ml/apache.uima.general/2007-09/msg00107.html clone.setFeatureValue(modview.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA), modview.getSofa()); final int mappedBegin = map[clone.getBegin()]; final int mappedEnd = map[clone.getEnd()]; if (mappedBegin < mappedEnd) { if (mappedEnd > fromJcas.getCas().getDocumentAnnotation().getEnd()) { getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping"); } else { int max = modview.getCas().getDocumentAnnotation().getEnd(); if (mappedBegin < max && mappedEnd <= max && mappedBegin >= 0 && mappedEnd > 0) { clone.setBegin(mappedBegin); clone.setEnd(mappedEnd); // TODO handle nested annotation features modview.addFsToIndexes(clone); indexedFs.add(clone); } else { getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping"); } } } else if (expandOffsets) { clone.setBegin(mappedBegin); clone.setEnd(mappedEnd); toExpand.add(clone); } } for (Annotation each : toExpand) { Annotation nextBestAnnotation = getNextBestAnnotation(each, modview); if (nextBestAnnotation != null) { each.setBegin(nextBestAnnotation.getBegin()); each.setEnd(nextBestAnnotation.getEnd()); Feature expandedOffsetsFeature = each.getType().getFeatureByBaseName("expandedOffsets"); if (expandedOffsetsFeature != null) { each.setBooleanValue(expandedOffsetsFeature, true); } modview.addFsToIndexes(each); } } } private Annotation getNextBestAnnotation(Annotation source, JCas jcas) { FSIterator<Annotation> iterator = jcas.getAnnotationIndex().iterator(source); Annotation best = null; if (iterator.isValid()) { Annotation annotation = iterator.get(); best = annotation; } else { Annotation dummy = new Annotation(jcas, source.getBegin(), source.getBegin() + 1); iterator = jcas.getAnnotationIndex().iterator(dummy); if (!iterator.isValid()) { if ((jcas.getDocumentText().length() / 2) > source.getBegin()) { iterator.moveToFirst(); if (iterator.isValid()) { Annotation annotation = iterator.get(); best = annotation; } } else { iterator.moveToLast(); if (iterator.isValid()) { Annotation annotation = iterator.get(); best = annotation; } } } } return best; } private SortedSet<HtmlConverterPSpan> handleLinebreaksInDocumentText( SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String splitSeq) { return this.handleConversion(visibleSpansSoFar, splitSeq, linebreakReplacement); } private SortedSet<HtmlConverterPSpan> htmlDecoding(SortedSet<HtmlConverterPSpan> visibleSpansSoFar) { TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar); Pattern patt = Pattern.compile("(&[a-zA-Z0-9]{2,6};)|(&#\\d{2,5};)"); for (HtmlConverterPSpan pSpan : visibleSpansSoFar) { String spanTxt = pSpan.getTxt(); Matcher matcher = patt.matcher(spanTxt); if (matcher.find()) { copy.remove(pSpan); int pSpanBegin = pSpan.getBegin(); int ioff = pSpan.getBegin(); do { String sourceString = matcher.group(); String replacement = StringEscapeUtils.unescapeHtml4(sourceString); HtmlConverterPSpanReplacement replacementSpan = new HtmlConverterPSpanReplacement( pSpanBegin + matcher.start(), pSpanBegin + matcher.end(), replacement); copy.add(replacementSpan); int replacementLength = sourceString.length(); if (pSpanBegin + matcher.end() > ioff + replacementLength) { int ib = ioff; int ie = pSpanBegin + matcher.start(); String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); copy.add(new HtmlConverterPSpan(ib, ie, newTxt)); ioff = ie; } ioff += replacementLength; // } while (matcher.find()); if (ioff < pSpan.getEnd()) { int ib = ioff; int ie = pSpan.getEnd(); String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); copy.add(new HtmlConverterPSpan(ioff, pSpan.getEnd(), newTxt)); } } } return copy; } private SortedSet<HtmlConverterPSpan> handleConversion(SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String patternString, String replacement) { TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar); Pattern patt = Pattern.compile(patternString); int replacementLength = patternString.length(); for (HtmlConverterPSpan pSpan : visibleSpansSoFar) { String spanTxt = pSpan.getTxt(); Matcher matcher = patt.matcher(spanTxt); if (matcher.find()) { copy.remove(pSpan); int pSpanBegin = pSpan.getBegin(); int ioff = pSpan.getBegin(); do { if (!StringUtils.isEmpty(replacement)) { HtmlConverterPSpanReplacement replacementSpan = new HtmlConverterPSpanReplacement( pSpanBegin + matcher.start(), pSpanBegin + matcher.end(), replacement); copy.add(replacementSpan); } if (pSpanBegin + matcher.end() > ioff + replacementLength) { int ib = ioff; int ie = pSpanBegin + matcher.start(); String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); copy.add(new HtmlConverterPSpan(ib, ie, newTxt)); ioff = ie; } ioff += replacementLength; // } while (matcher.find()); if (ioff < pSpan.getEnd()) { int ib = ioff; int ie = pSpan.getEnd(); String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin); copy.add(new HtmlConverterPSpan(ioff, pSpan.getEnd(), newTxt)); } } } return copy; } }