// Java tutorial
/*
 * Copyright 2014 Hugo m09? Mougard.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.crydee.alignment.aligner.cr;

import com.google.common.collect.Sets;
import eu.crydee.alignment.aligner.ts.Document;
import eu.crydee.alignment.aligner.ts.Paragraph;
import eu.crydee.alignment.aligner.ts.Sentence;
import eu.crydee.alignment.aligner.ts.Token;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.component.ViewCreatorAnnotator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

/**
 * Collection reader that emits one CAS per ordered pair of whitelist entries.
 *
 * <p>The whitelist file is read as UTF-8, one entry per line. The reader
 * iterates over the cartesian product of the set of entries with itself. For
 * a pair {@code (a, b)} it reads {@code <a>-ele.ada} and {@code <b>-bri.ada}
 * from the corpus directory and fills two views of the CAS with the
 * reconstructed text and with {@link Paragraph}, {@link Sentence} and
 * {@link Token} annotations. Every view, including the default one, also
 * receives a {@link Document} annotation named {@code a-b}.
 *
 * <p>Lines of an {@code .ada} file that match
 * {@code <number> <paragraph id> <space separated tokens>} are used; all other
 * lines are skipped. The first number is not used by this reader.
 *
 * @author Hugo m09? Mougard
 */
public class MetricsCR extends JCasCollectionReader_ImplBase {

    private static final Logger logger = LogManager.getLogger(MetricsCR.class);

    /** Line format of an {@code .ada} file; group 2 is the paragraph id, group 3 the tokens. */
    private static final Pattern ADA_LINE = Pattern.compile("(\\d+) (\\d+) (.*)");

    public static final String PARAM_WHITELIST_FILE_PATH = "P1";
    @ConfigurationParameter(name = PARAM_WHITELIST_FILE_PATH, mandatory = true)
    private String whitelistFilePath;

    public static final String PARAM_BRITANNICA_CORPUS_PATH = "P2";
    @ConfigurationParameter(name = PARAM_BRITANNICA_CORPUS_PATH, mandatory = true)
    private String corpusPath;

    public static final String PARAM_VIEW_NAME_ELEMENTARY = "P3";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_ELEMENTARY, mandatory = true)
    private String eleName;

    public static final String PARAM_VIEW_NAME_NORMAL = "P4";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_NORMAL, mandatory = true)
    private String normalName;

    private File whitelistFile;
    /** Iterator over the ordered pairs of whitelist entries still to be emitted. */
    private Iterator<List<String>> it;
    /** Number of CASes produced so far. */
    private int currentIndex;

    /**
     * Validates the whitelist file and prepares the iterator over all ordered
     * pairs of its entries.
     *
     * @param context the UIMA context, forwarded to the superclass
     * @throws ResourceInitializationException if the whitelist path is not a
     *         readable regular file or if reading it fails
     */
    @Override
    public void initialize(UimaContext context)
            throws ResourceInitializationException {
        super.initialize(context);
        whitelistFile = new File(whitelistFilePath);

        List<String> errs = new ArrayList<>();
        if (!whitelistFile.isFile()) {
            errs.add("The run file doesn't resolve to a file.");
        } else if (!whitelistFile.canRead()) {
            errs.add("The run file can't be read.");
        }
        if (!errs.isEmpty()) {
            String message = String.join("\n", errs);
            logger.error(message);
            // Carry the reason in the exception too, so callers that do not
            // see the log still learn what went wrong.
            throw new ResourceInitializationException(
                    new IllegalArgumentException(message));
        }

        try {
            // Trim entries and drop blank lines: a blank entry would otherwise
            // be turned into the file names "-ele.ada" / "-bri.ada".
            Set<String> entries
                    = FileUtils.readLines(whitelistFile, StandardCharsets.UTF_8)
                    .stream()
                    .map(String::trim)
                    .filter(entry -> !entry.isEmpty())
                    .collect(Collectors.toCollection(HashSet::new));
            it = Sets.cartesianProduct(entries, entries).iterator();
        } catch (FileNotFoundException e) {
            logger.error("Couldn't find the run file.", e);
            throw new ResourceInitializationException(e);
        } catch (IOException e) {
            logger.error("Couldn't read the whitelist file.", e);
            throw new ResourceInitializationException(e);
        }
        currentIndex = 0;
    }

    /**
     * Fills {@code jcas} with the next pair of documents.
     *
     * <p>Two views named after the configured view names are created. Each is
     * populated from its {@code .ada} file. The default view receives a fixed
     * placeholder text. All three views are tagged as English and get a
     * {@link Document} annotation spanning their whole text.
     *
     * @param jcas the CAS to populate
     * @throws IOException if one of the {@code .ada} files cannot be read
     * @throws CollectionException if a view cannot be created
     */
    @Override
    public void getNext(JCas jcas) throws IOException, CollectionException {
        JCas eleV;
        JCas briV;
        try {
            eleV = ViewCreatorAnnotator.createViewSafely(jcas, eleName);
            briV = ViewCreatorAnnotator.createViewSafely(jcas, normalName);
        } catch (AnalysisEngineProcessException ex) {
            throw new CollectionException(ex);
        }
        jcas.setDocumentLanguage("en");
        eleV.setDocumentLanguage("en");
        briV.setDocumentLanguage("en");

        // Entries were already trimmed when the whitelist was loaded.
        List<String> pair = it.next();
        String eleFilepath = pair.get(0);
        String normalFilepath = pair.get(1);
        String name = eleFilepath + "-" + normalFilepath;
        logger.info("processing {}", name);

        File ele = new File(corpusPath, eleFilepath + "-ele.ada");
        File bri = new File(corpusPath, normalFilepath + "-bri.ada");
        StringBuilder eleSb = new StringBuilder();
        StringBuilder normalSb = new StringBuilder();
        handleAda(ele, eleSb, eleV);
        handleAda(bri, normalSb, briV);
        eleV.setDocumentText(eleSb.toString());
        briV.setDocumentText(normalSb.toString());
        jcas.setDocumentText("The default CAS stays empty in this pipeline.");

        for (JCas view : new JCas[] { eleV, briV, jcas }) {
            // Annotation end offsets are exclusive, so spanning the whole text
            // means ending at length(), not length() - 1.
            Document document
                    = new Document(view, 0, view.getDocumentText().length());
            document.setName(name);
            document.addToIndexes();
        }
        ++currentIndex;
    }

    /**
     * Reads an {@code .ada} file as UTF-8, appends its text to {@code sb} and
     * creates the matching {@link Paragraph}, {@link Sentence} and
     * {@link Token} annotations in {@code jcas}.
     *
     * <p>Consecutive matching lines with the same paragraph id form one
     * paragraph. Sentences of a paragraph are separated by a single space and
     * paragraphs by a blank line. Lines that do not match the expected format
     * are ignored. Offsets are taken from {@code sb.length()}, so they agree
     * with the text accumulated in {@code sb}.
     *
     * @param file the {@code .ada} file to read
     * @param sb   receives the reconstructed document text
     * @param jcas the view in which annotations are created
     * @throws IOException if the file cannot be read
     */
    private void handleAda(File file, StringBuilder sb, JCas jcas)
            throws IOException {
        int previousParId = -1;
        Paragraph currentParagraph = null;
        for (String line : FileUtils.readLines(file, StandardCharsets.UTF_8)) {
            Matcher m = ADA_LINE.matcher(line);
            if (!m.matches()) {
                continue;
            }
            int parId = Integer.parseInt(m.group(2));
            if (currentParagraph == null || parId != previousParId) {
                if (currentParagraph != null) {
                    // The previous paragraph is complete: index it now that
                    // its offsets are final, then separate it from the next.
                    currentParagraph.addToIndexes();
                    sb.append("\n\n");
                }
                currentParagraph = new Paragraph(jcas);
                currentParagraph.setBegin(sb.length());
            } else {
                sb.append(' ');
            }
            appendSentence(m.group(3), sb, jcas);
            // Always extend the paragraph, so that single-sentence paragraphs
            // get a proper end offset as well.
            currentParagraph.setEnd(sb.length());
            previousParId = parId;
        }
        if (currentParagraph != null) {
            currentParagraph.addToIndexes();
        }
    }

    /**
     * Appends one sentence to {@code sb} and indexes a {@link Sentence} plus
     * one {@link Token} per space-separated chunk of {@code text}. Each
     * token's lemma is its English-locale lower-cased surface form.
     */
    private static void appendSentence(String text, StringBuilder sb, JCas jcas) {
        Sentence sentence = new Sentence(jcas);
        sentence.setBegin(sb.length());
        String[] tokens = text.split(" ");
        for (int i = 0; i < tokens.length; ++i) {
            if (i != 0) {
                sb.append(' ');
            }
            Token token = new Token(jcas);
            token.setLemma(tokens[i].toLowerCase(Locale.ENGLISH));
            token.setBegin(sb.length());
            sb.append(tokens[i]);
            token.setEnd(sb.length());
            token.addToIndexes();
        }
        sentence.setEnd(sb.length());
        sentence.addToIndexes();
    }

    /**
     * @return {@code true} while there are pairs of whitelist entries left
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return it.hasNext();
    }

    /**
     * @return a single progress entry counting the CASes produced so far; the
     *         total is reported as {@code -1}
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] {
            new ProgressImpl(currentIndex, -1, Progress.ENTITIES)
        };
    }
}