Java tutorial
/* * Copyright 2014 Hugo m09? Mougard. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package eu.crydee.alignment.aligner.cr; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ListMultimap; import eu.crydee.alignment.aligner.ts.Document; import eu.crydee.alignment.aligner.ts.Paragraph; import eu.crydee.alignment.aligner.ts.Sentence; import eu.crydee.alignment.aligner.ts.Token; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; import org.apache.uima.fit.component.ViewCreatorAnnotator; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import 
org.apache.uima.util.ProgressImpl;

/**
 * Collection reader that turns a folder of {@code .ada} files and a folder of
 * alignment annotation files into CASes.
 *
 * <p>
 * A document is emitted for every {@code X-ele.ada} file of the corpus folder
 * for which {@code X-bri.ada} also exists in the corpus folder and
 * {@code X-hum.txt} exists in the annotations folder. Each CAS gets two extra
 * views (named by {@link #PARAM_VIEW_NAME_ELEMENTARY} and
 * {@link #PARAM_VIEW_NAME_NORMAL}) holding the text rebuilt from the
 * {@code -ele.ada} and {@code -bri.ada} files respectively, annotated with
 * {@link Paragraph}s, {@link Sentence}s and {@link Token}s. The base view
 * holds the raw content of the annotation file. Sentences referenced by the
 * annotation file are linked to each other through their gold similarities.
 *
 * @author Hugo m09? Mougard
 */
public class BritannicaCR extends JCasCollectionReader_ImplBase {

    private static final Logger logger
            = LogManager.getLogger(BritannicaCR.class);

    /** Suffix of the files read into the "elementary" view. */
    private static final String ELE_SUFFIX = "-ele.ada";
    /** Suffix of the files read into the "normal" view. */
    private static final String BRI_SUFFIX = "-bri.ada";
    /** Suffix of the alignment annotation files. */
    private static final String ANN_SUFFIX = "-hum.txt";

    /** Selects the files that drive the iteration over the corpus. */
    private static final Pattern ELE_ADA_FILE = Pattern.compile(".*-ele\\.ada");

    /**
     * A usable line of an {@code .ada} file: two integers followed by the
     * space separated tokens of one sentence. Only the second integer (the
     * paragraph id) and the tokens are used by this reader.
     */
    private static final Pattern ADA_LINE
            = Pattern.compile("(\\d+) (\\d+) (.*)");

    public static final String PARAM_BRITANNICA_CORPUS_PATH = "P1";
    @ConfigurationParameter(name = PARAM_BRITANNICA_CORPUS_PATH,
            mandatory = true)
    private String corpusPath;

    public static final String PARAM_BRITANNICA_ANNOTATIONS_PATH = "P2";
    @ConfigurationParameter(name = PARAM_BRITANNICA_ANNOTATIONS_PATH,
            mandatory = true)
    private String annsPath;

    public static final String PARAM_VIEW_NAME_ELEMENTARY = "P3";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_ELEMENTARY,
            mandatory = true)
    private String eleName;

    public static final String PARAM_VIEW_NAME_NORMAL = "P4";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_NORMAL,
            mandatory = true)
    private String normalName;

    /** Names of the {@code -ele.ada} files to process, in emission order. */
    private String[] filesNames;
    private File corpus, anns;
    private int currentIndex, max;

    /**
     * Validates the two configured folders and selects the files to read.
     *
     * @param context the UIMA context, forwarded to the super class
     * @throws ResourceInitializationException if either folder is not a
     * readable directory or cannot be listed
     */
    @Override
    public void initialize(UimaContext context)
            throws ResourceInitializationException {
        super.initialize(context);
        corpus = new File(corpusPath);
        anns = new File(annsPath);

        // Both folders are checked independently so that every
        // configuration problem is reported at once.
        List<String> errs = new ArrayList<>();
        checkReadableDirectory(corpus, "corpus", errs);
        checkReadableDirectory(anns, "annotations", errs);
        if (!errs.isEmpty()) {
            String message = String.join("\n", errs);
            logger.error(message);
            throw new ResourceInitializationException(
                    new IllegalArgumentException(message));
        }

        // File.list() returns null when the listing fails.
        String[] corpusEntries = corpus.list();
        String[] annsEntries = anns.list();
        if (corpusEntries == null || annsEntries == null) {
            String message = "Could not list the content of "
                    + (corpusEntries == null ? corpus : anns) + ".";
            logger.error(message);
            throw new ResourceInitializationException(
                    new IOException(message));
        }

        Set<String> corpusNames = new HashSet<>(Arrays.asList(corpusEntries)),
                annsNames = new HashSet<>(Arrays.asList(annsEntries));
        // The order of File.list() is unspecified: sort to make the order in
        // which documents are emitted reproducible.
        filesNames = Arrays.stream(corpusEntries)
                .filter(n -> ELE_ADA_FILE.matcher(n).matches())
                .filter(n -> corpusNames.contains(
                        n.replace(ELE_SUFFIX, BRI_SUFFIX)))
                .filter(n -> annsNames.contains(
                        n.replace(ELE_SUFFIX, ANN_SUFFIX)))
                .sorted()
                .toArray(String[]::new);
        currentIndex = 0;
        max = filesNames.length;
    }

    /**
     * Records an error message if {@code dir} is not a readable directory.
     */
    private static void checkReadableDirectory(
            File dir,
            String label,
            List<String> errs) {
        if (!dir.isDirectory()) {
            errs.add("The Britannica " + label + " folder path doesn't "
                    + "resolve to a folder: " + dir);
        } else if (!dir.canRead()) {
            errs.add("The Britannica " + label + " folder can't be read: "
                    + dir);
        }
    }

    /**
     * Fills {@code jcas} with the next document of the corpus.
     *
     * @param jcas the CAS to populate; the two configured views are created
     * in it if needed
     * @throws IOException if a file cannot be read or the annotation file
     * contains a non numeric index
     * @throws CollectionException if the views cannot be created
     */
    @Override
    public void getNext(JCas jcas) throws IOException, CollectionException {
        JCas eleV, briV;
        try {
            eleV = ViewCreatorAnnotator.createViewSafely(jcas, eleName);
            briV = ViewCreatorAnnotator.createViewSafely(jcas, normalName);
        } catch (AnalysisEngineProcessException ex) {
            throw new CollectionException(ex);
        }
        jcas.setDocumentLanguage("en");
        eleV.setDocumentLanguage("en");
        briV.setDocumentLanguage("en");

        String eleFilename = filesNames[currentIndex],
                briFilename = eleFilename.replace(ELE_SUFFIX, BRI_SUFFIX),
                annFilename = eleFilename.replace(ELE_SUFFIX, ANN_SUFFIX),
                name = StringUtils.capitalize(
                        eleFilename.replace(ELE_SUFFIX, ""));
        File ele = new File(corpus, eleFilename),
                bri = new File(corpus, briFilename),
                ann = new File(anns, annFilename);

        ListMultimap<Integer, Integer> eleBriGold = ArrayListMultimap.create(),
                briEleGold = ArrayListMultimap.create();
        readGoldAlignments(ann, eleBriGold, briEleGold);

        StringBuilder eleSb = new StringBuilder(),
                normalSb = new StringBuilder();
        List<Sentence> eleSents = new ArrayList<>(),
                briSents = new ArrayList<>();
        handleAda(ele, eleSb, eleSents, eleV);
        handleAda(bri, normalSb, briSents, briV);

        linkGold(eleBriGold, eleSents, briSents, eleV);
        linkGold(briEleGold, briSents, eleSents, briV);

        eleV.setDocumentText(eleSb.toString());
        briV.setDocumentText(normalSb.toString());
        jcas.setDocumentText(FileUtils.readFileToString(ann));
        for (JCas view : new JCas[]{eleV, briV, jcas}) {
            // End offsets are exclusive (see how tokens are built in
            // handleAda), so the document spans [0, length).
            Document document = new Document(
                    view,
                    0,
                    view.getDocumentText().length());
            document.setName(name);
            document.addToIndexes();
        }
        ++currentIndex;
    }

    /**
     * Reads the alignment pairs of an annotation file.
     *
     * <p>
     * The file is consumed in groups of three lines: the first
     * space separated field of the first line and of the second line are the
     * two aligned (1-based) sentence indices, the third line is ignored.
     *
     * @param ann the annotation file
     * @param eleBriGold receives first index to second index
     * @param briEleGold receives second index to first index
     * @throws IOException if the file cannot be read or an index is not an
     * integer
     */
    private static void readGoldAlignments(
            File ann,
            ListMultimap<Integer, Integer> eleBriGold,
            ListMultimap<Integer, Integer> briEleGold) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(ann))) {
            String line;
            int lineNumber = 0;
            int eleIndex = -1;
            while ((line = br.readLine()) != null) {
                int position = lineNumber % 3;
                ++lineNumber;
                if (position == 2) {
                    continue;
                }
                int index;
                try {
                    index = Integer.parseInt(line.split(" ")[0]);
                } catch (NumberFormatException ex) {
                    throw new IOException("Malformed sentence index in "
                            + ann + " at line " + lineNumber + ".", ex);
                }
                if (position == 0) {
                    eleIndex = index;
                } else {
                    eleBriGold.put(eleIndex, index);
                    briEleGold.put(index, eleIndex);
                }
            }
        }
    }

    /**
     * Stores on each source sentence the target sentences it is aligned with.
     *
     * @param gold 1-based source index to 1-based target indices
     * @param sources sentences of the view the links are written to
     * @param targets sentences the links point to
     * @param sourceView view owning {@code sources}, used to allocate arrays
     */
    private static void linkGold(
            ListMultimap<Integer, Integer> gold,
            List<Sentence> sources,
            List<Sentence> targets,
            JCas sourceView) {
        for (Integer sourceIndex : gold.keySet()) {
            Sentence source = sources.get(sourceIndex - 1);
            List<Integer> targetIndices = gold.get(sourceIndex);
            int size = targetIndices.size();
            source.setGoldSimilarities(new FSArray(sourceView, size));
            for (int i = 0; i < size; ++i) {
                source.setGoldSimilarities(
                        i,
                        targets.get(targetIndices.get(i) - 1));
            }
        }
    }

    /**
     * Rebuilds the text of an {@code .ada} file and annotates it.
     *
     * <p>
     * Lines that do not match {@link #ADA_LINE} are skipped. Tokens are
     * joined with single spaces, sentences of a same paragraph with a single
     * space and paragraphs with a blank line. Offsets are computed from the
     * start of this file's content, so {@code sb} is expected to be empty on
     * entry.
     *
     * @param file the {@code .ada} file to read
     * @param sb receives the rebuilt text
     * @param sentencesList receives the sentences in file order
     * @param jcas the view the annotations are created in
     * @throws IOException if the file cannot be read
     */
    @SuppressWarnings("null")
    private void handleAda(
            File file,
            StringBuilder sb,
            List<Sentence> sentencesList,
            JCas jcas) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            int previousParId = -1;
            int endOffset = 0;
            Paragraph currentParagraph = null;
            while ((line = br.readLine()) != null) {
                Matcher m = ADA_LINE.matcher(line);
                if (!m.matches()) {
                    continue;
                }
                int parId = Integer.parseInt(m.group(2));
                if (previousParId != parId) {
                    if (currentParagraph != null) {
                        // The previous paragraph is complete: index it now
                        // that its offsets will not change any more.
                        currentParagraph.addToIndexes();
                        // Separate from the previous paragraph, whatever the
                        // numbering of the paragraph ids is.
                        sb.append("\n\n");
                        endOffset += 2;
                    }
                    currentParagraph = new Paragraph(jcas);
                    currentParagraph.setBegin(endOffset);
                } else {
                    sb.append(' ');
                    ++endOffset;
                }
                Sentence sentence = new Sentence(jcas);
                sentence.setBegin(endOffset);
                String[] tokens = m.group(3).split(" ");
                for (int i = 0, l = tokens.length; i < l; ++i) {
                    if (i != 0) {
                        sb.append(' ');
                        ++endOffset;
                    }
                    Token token = new Token(jcas);
                    token.setLemma(tokens[i].toLowerCase(Locale.ENGLISH));
                    token.setBegin(endOffset);
                    endOffset += tokens[i].length();
                    token.setEnd(endOffset);
                    token.addToIndexes();
                    sb.append(tokens[i]);
                }
                sentence.setEnd(endOffset);
                // Every sentence extends its paragraph, including the first
                // one, so single sentence paragraphs get a valid end too.
                currentParagraph.setEnd(endOffset);
                sentence.addToIndexes();
                sentencesList.add(sentence);
                previousParId = parId;
            }
            if (currentParagraph != null) {
                currentParagraph.addToIndexes();
            }
        }
    }

    /**
     * @return {@code true} while some selected files have not been emitted
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return currentIndex < max;
    }

    /**
     * @return a single progress entry counting emitted documents out of the
     * number of selected files
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[]{
            new ProgressImpl(currentIndex, max, Progress.ENTITIES)
        };
    }
}