// Java tutorial
/******************************************************************************* * This file is part of the Coporate Semantic Web Project. * * This work has been partially supported by the ``InnoProfile-Corporate Semantic Web" project funded by the German Federal * Ministry of Education and Research (BMBF) and the BMBF Innovation Initiative for the New German Laender - Entrepreneurial Regions. * * http://www.corporate-semantic-web.de/ * * Freie Universitaet Berlin * Copyright (c) 2007-2013 * * Institut fuer Informatik * Working Group Coporate Semantic Web * Koenigin-Luise-Strasse 24-26 * 14195 Berlin * * http://www.mi.fu-berlin.de/en/inf/groups/ag-csw/ * * This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
 * You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA or see <http://www.gnu.org/licenses/>
 ******************************************************************************/
package de.csw.ontology;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.CSWGermanAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.jfree.util.Log;

import de.csw.lucene.ConceptFilter;
import de.csw.util.Config;
import de.csw.util.URLEncoder;

/**
 * Uses background knowledge to enhance the text.
 *
 * @author rheese
 */
public class XWikiTextEnhancer implements TextEnhancer {

	static final Logger log = Logger.getLogger(XWikiTextEnhancer.class);

	/** Maximum number of related concept labels offered in one search link. */
	static final int MAX_SIMILAR_CONCEPTS = Config.getIntAppProperty(Config.LUCENE_MAXSEARCHTERMS);

	/** Base URL of the wiki's Lucene search page (read from the application config). */
	static final String LUCENE_URL = Config.getAppProperty(Config.LUCENE_URL);

	/** Ontology index used to look up concept labels related to an annotated term. */
	OntologyIndex index;

	/** index for storing the positions of links in a text (start position, end position) */
	TreeMap<Integer, Integer> linkIndex = new TreeMap<Integer, Integer>();

	public XWikiTextEnhancer() {
		// Obtain the shared singleton index instance.
		index = OntologyIndex.get();
	}

	/**
	 * The enhanced text contains links to the Lucene search page of the xWiki
	 * system. The search terms are related to the annotated phrase.
*/ public String enhance(String text) { CSWGermanAnalyzer ga = new CSWGermanAnalyzer(); TokenStream ts = null; StringBuilder result = new StringBuilder(); initializeLinkIndex(text); try { Reader r = new BufferedReader(new StringReader(text)); ts = ga.tokenStream("", r); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); TypeAttribute typeAttribute = ts.addAttribute(TypeAttribute.class); String term; int lastEndIndex = 0; while (ts.incrementToken()) { result.append(text.substring(lastEndIndex, offsetAttribute.startOffset())); term = String.copyValueOf(charTermAttribute.buffer(), 0, charTermAttribute.length()); if (typeAttribute.type().equals(ConceptFilter.CONCEPT_TYPE) && isAnnotatable(offsetAttribute)) { log.debug("Annotating concept: " + term); annotateWithSearch(result, text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()), term); } else { result.append(text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset())); } lastEndIndex = offsetAttribute.endOffset(); } result.append(text.subSequence(lastEndIndex, text.length())); } catch (IOException e) { Log.error("Error while processing the page content", e); } ga.close(); return result.toString(); } private static final Pattern[] EXCLUDE_FROM_ENHANCEMENTS = { Pattern.compile("\\[\\[[^\\]]*\\]\\]"), Pattern.compile("<csw:linkset.*?>.*?</csw:linkset>"), Pattern.compile("\\{\\{(velocity|groovy|html).*?\\}\\}.*?\\{\\{/\\1\\}\\}", Pattern.DOTALL) }; /** * Extract from text all phrases that are enclosed by '[' and ']' denoting * an xWiki link. 
	 *
	 * @param text
	 *            text to parse
	 */
	protected void initializeLinkIndex(String text) {
		if (text == null)
			throw new NullPointerException("Parameter text must not be null");

		linkIndex.clear();

		if (text.isEmpty())
			return;

		// Record the [start, end] offsets of every excluded region. Later
		// matches with the same start offset overwrite earlier ones.
		for (Pattern pattern : EXCLUDE_FROM_ENHANCEMENTS) {
			Matcher matcher = pattern.matcher(text);
			while (matcher.find()) {
				linkIndex.put(matcher.start(), matcher.end());
			}
		}
	}

	/**
	 * Test if a token can be annotated by the {@link TextEnhancer}, e.g., if it
	 * is not inside an exclude range (e.g. a wiki link).
	 *
	 * @param offsetAttribute
	 *            the offset of the token into the text.
	 * @return true iff the token can be annotated
	 */
	protected boolean isAnnotatable(OffsetAttribute offsetAttribute) {
		final int tokenStart = offsetAttribute.startOffset();
		// Walk backwards through all ranges that start at or before the token;
		// the token is excluded as soon as one of them reaches its start offset.
		Entry<Integer, Integer> containingRange = linkIndex.floorEntry(tokenStart);
		while (containingRange != null) {
			if (containingRange.getValue() >= tokenStart) {
				return false;
			}
			containingRange = linkIndex.lowerEntry(containingRange.getKey());
		}
		return true;
	}

	/**
	 * Annotates the term by linking <code>term</code> to the search page of the
	 * wiki.
	 *
	 * @param sb
	 *            the string builder the result is appended to
	 * @param term
	 *            a term
	 * @param stemBase
	 *            the base form of the term
	 */
	protected void annotateWithSearch(StringBuilder sb, String term, String stemBase) {
		List<String> matches = index.getSimilarMatchLabels(term, MAX_SIMILAR_CONCEPTS);

		// No related concepts: leave the term unannotated.
		if (matches.isEmpty())
			return;

		// Emit an xWiki link: [[term>>searchURL||class=... title="..."]]
		sb.append("[[").append(term);
		sb.append(">>").append(getSearchURL(matches));
		sb.append("||class=\"similarconcept\"");

		Iterator<String> it = matches.listIterator();
		// Tooltip lists the related terms ("search for the related terms: ...").
		sb.append(" title=\"Suche nach den verwandten Begriffen: ");
		boolean afterFirstTerm = false;
		while (it.hasNext()) {
			String similarTerm = it.next();
			// Skip labels that stem to the term itself — only truly related terms are listed.
			if (!stemBase.equals(this.index.getStemmer().stem(similarTerm))) {
				if (afterFirstTerm) {
					sb.append(", ");
				}
				sb.append(similarTerm);
				afterFirstTerm = true;
			}
		}
		sb.append("\"]]");
	}

	/**
	 * Creates a link to the search wiki page.
* * @param terms * a collection of search terms * @return the link */ protected String getSearchURL(Collection<String> terms) { log.debug("** search terms: " + terms); return LUCENE_URL + "?text=" + URLEncoder.encode(StringUtils.join(terms, ' ')); } }