eu.crydee.alignment.aligner.cr.MetricsCR.java Source code

Java tutorial

Introduction

Here is the source code for eu.crydee.alignment.aligner.cr.MetricsCR.java

Source

/*
 * Copyright 2014 Hugo "m09" Mougard.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.crydee.alignment.aligner.cr;

import com.google.common.collect.Sets;
import eu.crydee.alignment.aligner.ts.Document;
import eu.crydee.alignment.aligner.ts.Paragraph;
import eu.crydee.alignment.aligner.ts.Sentence;
import eu.crydee.alignment.aligner.ts.Token;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.component.ViewCreatorAnnotator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

/**
 * Collection reader that emits one CAS per ordered pair of whitelist entries.
 *
 * <p>The whitelist file is read as UTF-8, one entry per line. The reader
 * iterates over the cartesian product of the entries with themselves, so every
 * entry is also paired with itself. For a pair {@code (a, b)} the files
 * {@code a-ele.ada} and {@code b-bri.ada} are read from the corpus directory
 * into two separate views of the CAS, and {@link Paragraph}, {@link Sentence}
 * and {@link Token} annotations are created from their content. A
 * {@link Document} annotation named {@code a-b} is added to both views and to
 * the default view.
 *
 * <p>Lines of an {@code .ada} file are expected to match
 * {@code <number> <paragraph id> <space separated tokens>}; lines that do not
 * match are skipped. The first number is not used by this reader (presumably a
 * sentence id; to be confirmed against the corpus documentation).
 *
 * @author Hugo "m09" Mougard
 */
public class MetricsCR extends JCasCollectionReader_ImplBase {

    private static final Logger logger = LogManager.getLogger(MetricsCR.class);

    /** Path of the UTF-8 whitelist file, one document name per line. */
    public static final String PARAM_WHITELIST_FILE_PATH = "P1";
    @ConfigurationParameter(name = PARAM_WHITELIST_FILE_PATH, mandatory = true)
    private String whitelistFilePath;

    /** Directory containing the {@code *-ele.ada} and {@code *-bri.ada} files. */
    public static final String PARAM_BRITANNICA_CORPUS_PATH = "P2";
    @ConfigurationParameter(name = PARAM_BRITANNICA_CORPUS_PATH, mandatory = true)
    private String corpusPath;

    /** Name of the view receiving the content of the {@code -ele.ada} file. */
    public static final String PARAM_VIEW_NAME_ELEMENTARY = "P3";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_ELEMENTARY, mandatory = true)
    private String eleName;

    /** Name of the view receiving the content of the {@code -bri.ada} file. */
    public static final String PARAM_VIEW_NAME_NORMAL = "P4";
    @ConfigurationParameter(name = PARAM_VIEW_NAME_NORMAL, mandatory = true)
    private String normalName;

    /** One {@code .ada} line: two numbers followed by the sentence text. */
    private static final Pattern ADA_LINE = Pattern.compile("(\\d+) (\\d+) (.*)");

    private File whitelistFile;

    /** Iterator over the (elementary, normal) name pairs still to be read. */
    private Iterator<List<String>> it;

    /** Number of pairs already handed out by {@link #getNext(JCas)}. */
    private int currentIndex;

    /** Total number of pairs, or -1 while the reader is not initialized. */
    private int totalPairs = -1;

    /**
     * Validates the whitelist file and prepares the iteration over all pairs
     * of its entries.
     *
     * @param context the UIMA context, forwarded to the superclass
     * @throws ResourceInitializationException if the whitelist path is not a
     *         readable file or if reading it fails
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        whitelistFile = new File(whitelistFilePath);

        String problem = null;
        if (!whitelistFile.isFile()) {
            problem = "The whitelist file doesn't resolve to a file: ";
        } else if (!whitelistFile.canRead()) {
            problem = "The whitelist file can't be read: ";
        }
        if (problem != null) {
            String message = problem + whitelistFile;
            logger.error(message);
            // Carry the reason in the cause so that it is not only in the log.
            throw new ResourceInitializationException(new IOException(message));
        }

        try {
            // Trim before de-duplicating so that "foo" and " foo" are one
            // entry, drop blank lines (they would resolve to "-ele.ada"), and
            // keep the file order so that runs are reproducible.
            Set<String> names = FileUtils.readLines(whitelistFile, StandardCharsets.UTF_8)
                    .stream()
                    .map(String::trim)
                    .filter(name -> !name.isEmpty())
                    .collect(Collectors.toCollection(LinkedHashSet::new));
            Set<List<String>> pairs = Sets.cartesianProduct(names, names);
            totalPairs = pairs.size();
            it = pairs.iterator();
        } catch (IOException e) {
            logger.error("Couldn't read the whitelist file " + whitelistFile + ".", e);
            throw new ResourceInitializationException(e);
        }
        currentIndex = 0;
    }

    /**
     * Fills the given CAS with the next (elementary, normal) pair.
     *
     * @param jcas the CAS to populate; both configured views are created in it
     * @throws IOException if one of the two {@code .ada} files cannot be read
     * @throws CollectionException if a view cannot be created
     */
    @Override
    public void getNext(JCas jcas) throws IOException, CollectionException {
        JCas eleV, briV;
        try {
            eleV = ViewCreatorAnnotator.createViewSafely(jcas, eleName);
            briV = ViewCreatorAnnotator.createViewSafely(jcas, normalName);
        } catch (AnalysisEngineProcessException ex) {
            throw new CollectionException(ex);
        }
        jcas.setDocumentLanguage("en");
        eleV.setDocumentLanguage("en");
        briV.setDocumentLanguage("en");

        // Entries were already trimmed when the whitelist was loaded.
        List<String> pair = it.next();
        String elementaryDoc = pair.get(0);
        String normalDoc = pair.get(1);
        String name = elementaryDoc + "-" + normalDoc;
        logger.info("processing {}", name);

        File ele = new File(corpusPath, elementaryDoc + "-ele.ada");
        File bri = new File(corpusPath, normalDoc + "-bri.ada");

        StringBuilder eleSb = new StringBuilder();
        StringBuilder normalSb = new StringBuilder();
        handleAda(ele, eleSb, eleV);
        handleAda(bri, normalSb, briV);
        eleV.setDocumentText(eleSb.toString());
        briV.setDocumentText(normalSb.toString());
        jcas.setDocumentText("The default CAS stays empty in this pipeline.");

        for (JCas view : new JCas[] { eleV, briV, jcas }) {
            // Annotation end offsets are exclusive (as for the tokens and
            // sentences below), so the document spans [0, length).
            Document document = new Document(view, 0, view.getDocumentText().length());
            document.setName(name);
            document.addToIndexes();
        }
        ++currentIndex;
    }

    /**
     * Reads one {@code .ada} file, appending its text to {@code sb} and
     * creating the matching annotations in {@code jcas}.
     *
     * <p>Tokens of a sentence are joined by single spaces, sentences of the
     * same paragraph by a single space, and paragraphs by a blank line
     * ({@code "\n\n"}). A new paragraph starts whenever the paragraph id
     * differs from the one of the previous matching line. Annotation offsets
     * are offsets into {@code sb}.
     *
     * @param file the {@code .ada} file to read, decoded as UTF-8
     * @param sb   receives the reconstructed text
     * @param jcas the view in which the annotations are created
     * @throws IOException if the file cannot be opened or read
     */
    private void handleAda(File file, StringBuilder sb, JCas jcas) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            int previousParId = -1;
            int endOffset = sb.length();
            Paragraph currentParagraph = null;
            while ((line = br.readLine()) != null) {
                Matcher m = ADA_LINE.matcher(line);
                if (!m.matches()) {
                    continue;
                }
                int parId = Integer.parseInt(m.group(2));
                if (currentParagraph == null || previousParId != parId) {
                    if (currentParagraph != null) {
                        // The previous paragraph is complete: index it now
                        // that its end is final, then separate the two.
                        currentParagraph.addToIndexes();
                        sb.append("\n\n");
                        endOffset += 2;
                    }
                    currentParagraph = new Paragraph(jcas);
                    currentParagraph.setBegin(endOffset);
                } else {
                    sb.append(' ');
                    ++endOffset;
                }

                Sentence sentence = new Sentence(jcas);
                sentence.setBegin(endOffset);
                String[] tokens = m.group(3).split(" ");
                for (int i = 0, l = tokens.length; i < l; ++i) {
                    if (i != 0) {
                        sb.append(' ');
                        ++endOffset;
                    }
                    Token token = new Token(jcas);
                    token.setLemma(tokens[i].toLowerCase(Locale.ENGLISH));
                    token.setBegin(endOffset);
                    endOffset += tokens[i].length();
                    token.setEnd(endOffset);
                    token.addToIndexes();
                    sb.append(tokens[i]);
                }
                sentence.setEnd(endOffset);
                sentence.addToIndexes();

                // Extend the paragraph after every sentence, including the
                // first one, so single-sentence paragraphs get an end too.
                currentParagraph.setEnd(endOffset);
                previousParId = parId;
            }
            if (currentParagraph != null) {
                currentParagraph.addToIndexes();
            }
        }
    }

    /**
     * @return {@code true} while at least one pair remains to be read
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return it.hasNext();
    }

    /**
     * @return a single progress entry counting the pairs read so far out of
     *         the total number of pairs
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(currentIndex, totalPairs, Progress.ENTITIES) };
    }
}