eu.crydee.alignment.aligner.cr.BritannicaCR.java Source code

Java tutorial

Introduction

Here is the source code for eu.crydee.alignment.aligner.cr.BritannicaCR.java

Source

/*
 * Copyright 2014 Hugo "m09" Mougard.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.crydee.alignment.aligner.cr;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import eu.crydee.alignment.aligner.ts.Document;
import eu.crydee.alignment.aligner.ts.Paragraph;
import eu.crydee.alignment.aligner.ts.Sentence;
import eu.crydee.alignment.aligner.ts.Token;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.component.ViewCreatorAnnotator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

/**
 * Collection reader that turns pairs of Britannica {@code .ada} files and
 * their human alignment annotations into CASes.
 *
 * <p>
 * A document is read for every {@code X-ele.ada} file of the corpus folder
 * for which a {@code X-bri.ada} file exists in the same folder and a
 * {@code X-hum.txt} file exists in the annotations folder. Each produced CAS
 * holds:
 * <ul>
 * <li>a view (named by {@link #PARAM_VIEW_NAME_ELEMENTARY}) built from the
 * {@code -ele.ada} file,</li>
 * <li>a view (named by {@link #PARAM_VIEW_NAME_NORMAL}) built from the
 * {@code -bri.ada} file,</li>
 * <li>the raw content of the {@code -hum.txt} file as the text of the
 * default view.</li>
 * </ul>
 * Sentences of both views reference each other through their gold
 * similarities, as listed in the {@code -hum.txt} file.
 *
 * @author Hugo "m09" Mougard
 */
public class BritannicaCR extends JCasCollectionReader_ImplBase {

    private static final Logger logger = LogManager.getLogger(BritannicaCR.class);

    /** Suffix of the files read into the elementary view. */
    private static final String ELE_SUFFIX = "-ele.ada";

    /** Suffix of the files read into the normal view. */
    private static final String BRI_SUFFIX = "-bri.ada";

    /** Suffix of the gold annotation files. */
    private static final String HUM_SUFFIX = "-hum.txt";

    /**
     * Shape of a useful {@code .ada} line: two integers followed by the
     * space separated tokens of a sentence. The second integer is used as the
     * paragraph id; the first one is ignored by this reader.
     */
    private static final Pattern ADA_LINE = Pattern.compile("(\\d+) (\\d+) (.*)");

    public static final String PARAM_BRITANNICA_CORPUS_PATH = "P1";
    @ConfigurationParameter(name = PARAM_BRITANNICA_CORPUS_PATH, mandatory = true)
    private String corpusPath;

    public static final String PARAM_BRITANNICA_ANNOTATIONS_PATH = "P2";
    @ConfigurationParameter(name = PARAM_BRITANNICA_ANNOTATIONS_PATH, mandatory = true)
    private String annsPath;

    public static final String PARAM_VIEW_NAME_ELEMENTARY = "P3";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_ELEMENTARY, mandatory = true)
    private String eleName;

    public static final String PARAM_VIEW_NAME_NORMAL = "P4";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_NORMAL, mandatory = true)
    private String normalName;

    /** Names (not paths) of the {@code -ele.ada} files that will be read. */
    private String[] filesNames;

    private File corpus, anns;

    /** Index in {@link #filesNames} of the next document, and its length. */
    private int currentIndex, max;

    /**
     * Validates the two configured folders and selects the documents to read.
     *
     * <p>
     * Problems with the corpus folder and with the annotations folder are
     * reported together. The selected file names are sorted so that the
     * reading order does not depend on the file system.
     *
     * @param context the UIMA context, forwarded to the parent class.
     * @throws ResourceInitializationException if one of the folders is not a
     * readable, listable directory.
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        corpus = new File(corpusPath);
        anns = new File(annsPath);
        List<String> errs = new ArrayList<>();
        // The two folders are checked independently so that a single run
        // reports every misconfiguration.
        if (!corpus.isDirectory()) {
            errs.add("The Britannica corpus folder path doesn't resolve to a folder.");
        } else if (!corpus.canRead()) {
            errs.add("The Britannica corpus folder can't be read.");
        }
        if (!anns.isDirectory()) {
            errs.add("The Britannica annotations folder path doesn't resolve to a folder.");
        } else if (!anns.canRead()) {
            errs.add("The Britannica annotations folder can't be read.");
        }
        failOnErrors(errs);
        // File.list() returns null on I/O errors, even for a directory.
        String[] corpusListing = corpus.list();
        String[] annsListing = anns.list();
        if (corpusListing == null) {
            errs.add("The Britannica corpus folder can't be listed.");
        }
        if (annsListing == null) {
            errs.add("The Britannica annotations folder can't be listed.");
        }
        failOnErrors(errs);
        Set<String> corpusNames = new HashSet<>(Arrays.asList(corpusListing));
        Set<String> annsNames = new HashSet<>(Arrays.asList(annsListing));
        filesNames = corpusNames.stream()
                .filter(n -> n.endsWith(ELE_SUFFIX))
                .filter(n -> corpusNames.contains(sibling(n, BRI_SUFFIX))
                        && annsNames.contains(sibling(n, HUM_SUFFIX)))
                .sorted()
                .toArray(String[]::new);
        currentIndex = 0;
        max = filesNames.length;
    }

    /**
     * Logs the collected errors and aborts the initialization if there are
     * any.
     */
    private static void failOnErrors(List<String> errs) throws ResourceInitializationException {
        if (errs.isEmpty()) {
            return;
        }
        String message = String.join("\n", errs);
        logger.error(message);
        throw new ResourceInitializationException(new IllegalArgumentException(message));
    }

    /**
     * Replaces the trailing {@code -ele.ada} of a file name by another
     * suffix.
     */
    private static String sibling(String eleFileName, String suffix) {
        return eleFileName.substring(0, eleFileName.length() - ELE_SUFFIX.length()) + suffix;
    }

    /**
     * Fills the given CAS with the next document.
     *
     * @param jcas the CAS to fill; the two configured views are created in
     * it if needed.
     * @throws IOException if one of the three files of the document can't be
     * read or if the gold annotation file is malformed.
     * @throws CollectionException if the views can't be created or if there
     * is no document left to read.
     */
    @Override
    public void getNext(JCas jcas) throws IOException, CollectionException {
        if (!hasNext()) {
            throw new CollectionException(
                    new IllegalStateException("No more Britannica documents to read."));
        }
        JCas eleV, briV;
        try {
            eleV = ViewCreatorAnnotator.createViewSafely(jcas, eleName);
            briV = ViewCreatorAnnotator.createViewSafely(jcas, normalName);
        } catch (AnalysisEngineProcessException ex) {
            throw new CollectionException(ex);
        }
        jcas.setDocumentLanguage("en");
        eleV.setDocumentLanguage("en");
        briV.setDocumentLanguage("en");
        String eleFileName = filesNames[currentIndex];
        String name = StringUtils.capitalize(sibling(eleFileName, ""));
        File ele = new File(corpus, eleFileName);
        File bri = new File(corpus, sibling(eleFileName, BRI_SUFFIX));
        File ann = new File(anns, sibling(eleFileName, HUM_SUFFIX));
        ListMultimap<Integer, Integer> eleBriGold = ArrayListMultimap.create(),
                briEleGold = ArrayListMultimap.create();
        readGold(ann, eleBriGold, briEleGold);
        StringBuilder eleSb = new StringBuilder(), normalSb = new StringBuilder();
        List<Sentence> eleSents = new ArrayList<>(), briSents = new ArrayList<>();
        handleAda(ele, eleSb, eleSents, eleV);
        handleAda(bri, normalSb, briSents, briV);
        linkGold(eleBriGold, eleSents, briSents, eleV);
        linkGold(briEleGold, briSents, eleSents, briV);
        eleV.setDocumentText(eleSb.toString());
        briV.setDocumentText(normalSb.toString());
        // NOTE(review): this and the FileReaders decode with the platform
        // default charset; confirm the corpus encoding and make it explicit.
        jcas.setDocumentText(FileUtils.readFileToString(ann));
        for (JCas j : new JCas[] { eleV, briV, jcas }) {
            // End offsets are exclusive (as for the tokens and sentences
            // built in handleAda), so the document ends at length().
            Document document = new Document(j, 0, j.getDocumentText().length());
            document.setName(name);
            document.addToIndexes();
        }
        ++currentIndex;
    }

    /**
     * Reads the gold alignments of an annotation file.
     *
     * <p>
     * The file is read by groups of three lines: the leading integer of the
     * first line and the leading integer of the second line form an aligned
     * pair, the third line is skipped.
     *
     * @param ann the annotation file.
     * @param eleBriGold filled with first-line index to second-line index.
     * @param briEleGold filled with second-line index to first-line index.
     * @throws IOException if the file can't be read or if an expected
     * leading integer is missing.
     */
    private static void readGold(File ann, ListMultimap<Integer, Integer> eleBriGold,
            ListMultimap<Integer, Integer> briEleGold) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(ann))) {
            String line;
            int lineNumber = 0;
            int eleIndex = -1;
            while ((line = br.readLine()) != null) {
                int position = lineNumber % 3;
                ++lineNumber;
                if (position == 2) {
                    continue;
                }
                int index;
                try {
                    int space = line.indexOf(' ');
                    index = Integer.parseInt(space < 0 ? line : line.substring(0, space));
                } catch (NumberFormatException ex) {
                    throw new IOException("Malformed gold annotation in " + ann
                            + " at line " + lineNumber + ".", ex);
                }
                if (position == 0) {
                    eleIndex = index;
                } else {
                    eleBriGold.put(eleIndex, index);
                    briEleGold.put(index, eleIndex);
                }
            }
        }
    }

    /**
     * Stores in each aligned sentence of {@code from} the sentences of
     * {@code to} it is aligned with. Indexes of {@code gold} are 1-based
     * positions in the sentence lists.
     */
    private static void linkGold(ListMultimap<Integer, Integer> gold, List<Sentence> from,
            List<Sentence> to, JCas fromView) {
        for (Integer fromIndex : gold.keySet()) {
            Sentence source = from.get(fromIndex - 1);
            List<Integer> toIndexes = gold.get(fromIndex);
            source.setGoldSimilarities(new FSArray(fromView, toIndexes.size()));
            for (int i = 0, l = toIndexes.size(); i < l; ++i) {
                source.setGoldSimilarities(i, to.get(toIndexes.get(i) - 1));
            }
        }
    }

    /**
     * Reads a {@code .ada} file, rebuilding its text and annotating it.
     *
     * <p>
     * Lines that don't match {@link #ADA_LINE} are ignored. Tokens are
     * joined by single spaces, sentences of a same paragraph by a single
     * space and paragraphs by a blank line. A new paragraph starts whenever
     * the paragraph id differs from the one of the previous sentence.
     *
     * @param file the {@code .ada} file to read.
     * @param sb receives the rebuilt text.
     * @param sentencesList receives the created sentences, in file order.
     * @param jcas the view in which annotations are created and indexed.
     * @throws IOException if the file can't be read.
     */
    private void handleAda(File file, StringBuilder sb, List<Sentence> sentencesList, JCas jcas)
            throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            int previousParId = -1;
            int endOffset = 0;
            Paragraph currentParagraph = null;
            while ((line = br.readLine()) != null) {
                Matcher m = ADA_LINE.matcher(line);
                if (!m.matches()) {
                    continue;
                }
                int parId = Integer.parseInt(m.group(2));
                if (currentParagraph == null || previousParId != parId) {
                    if (currentParagraph != null) {
                        // The previous paragraph is complete: index it only
                        // now, so that it is never modified once indexed.
                        currentParagraph.addToIndexes();
                        sb.append("\n\n");
                        endOffset += 2;
                    }
                    currentParagraph = new Paragraph(jcas);
                    currentParagraph.setBegin(endOffset);
                } else {
                    sb.append(' ');
                    ++endOffset;
                }
                Sentence sentence = new Sentence(jcas);
                sentence.setBegin(endOffset);
                String[] tokens = m.group(3).split(" ");
                for (int i = 0, l = tokens.length; i < l; ++i) {
                    if (i != 0) {
                        sb.append(' ');
                        ++endOffset;
                    }
                    Token token = new Token(jcas);
                    token.setLemma(tokens[i].toLowerCase(Locale.ENGLISH));
                    token.setBegin(endOffset);
                    endOffset += tokens[i].length();
                    token.setEnd(endOffset);
                    token.addToIndexes();
                    sb.append(tokens[i]);
                }
                sentence.setEnd(endOffset);
                // Every sentence extends its paragraph, including the first
                // one, so single-sentence paragraphs get a proper end.
                currentParagraph.setEnd(endOffset);
                sentence.addToIndexes();
                sentencesList.add(sentence);
                previousParId = parId;
            }
            if (currentParagraph != null) {
                currentParagraph.addToIndexes();
            }
        }
    }

    /**
     * @return whether at least one document remains to be read.
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return currentIndex < max;
    }

    /**
     * @return the number of documents read so far out of the total number of
     * documents, as a single entity-based progress.
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(currentIndex, max, Progress.ENTITIES) };
    }

}