de.tudarmstadt.ukp.csniper.resbuild.stuff.BncLocalCorpusBuilder.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.csniper.resbuild.stuff.BncLocalCorpusBuilder.java, a small uimaFIT pipeline that reads British National Corpus (BNC) documents from a local zip archive, parses them with the Stanford parser, and writes the results both as serialized CASes and as CSV files for database import.

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.csniper.resbuild.stuff;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createAggregateDescription;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createPrimitiveDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createCollectionReader;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.csniper.resbuild.ProgressLogger;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod;
import de.tudarmstadt.ukp.dkpro.core.io.bincas.SerializedCasWriter;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser;

/** 
 * @author Erik-Lân Do Dinh
 */
public class BncLocalCorpusBuilder {
    private static final String COLLECTION_ID = "BNC";
    private static final String HADOOP_USER_HOME = "D:/ukp/data";
    private static final String INPUT_BNC_PATH = "jar:file:" + HADOOP_USER_HOME + "/BNC.zip!";
    private static final String INCLUSION_FILE = HADOOP_USER_HOME + "/inclusions.txt";
    private static final String EXCLUSION_FILE = HADOOP_USER_HOME + "/exclusions.txt";
    // note: unlike on HDFS, no $dir placeholder substitution happens here;
    // the output paths below are used as plain local paths
    private static final String OUTPUT_SER_CAS_PATH = HADOOP_USER_HOME + "/output2/" + COLLECTION_ID
            + "/serialized/";
    private static final String OUTPUT_CSV_PATH = HADOOP_USER_HOME + "/output2/" + COLLECTION_ID + "/csv/";
    private static final String CLASS_NAME = BncLocalCorpusBuilder.class.getSimpleName();

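    /**
     * Builds the collection reader for the BNC zip archive. Inclusion and exclusion
     * patterns are read from optional text files (one document per line); if the
     * inclusion file cannot be read, all XML documents in the archive are included.
     */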
    public static CollectionReader buildCollectionReader() throws ResourceInitializationException {
        List<String> patterns = new ArrayList<String>();

        try {
            patterns.addAll(read(INCLUSION_FILE, "[+]**/"));
            System.out.println("Including documents specified in [" + INCLUSION_FILE + "].");
        } catch (IOException e) {
            patterns.add("[+]**/*.xml");
            System.out.println("No inclusions specified, parsing all BNC documents.");
        }

        try {
            patterns.addAll(read(EXCLUSION_FILE, "[-]**/"));
            System.out.println("Excluding documents specified in [" + EXCLUSION_FILE + "].");
        } catch (IOException e) {
            System.out.println("No exclusions specified, parsing all specified BNC documents.");
        }

        return createCollectionReader(BncReaderReloaded.class, BncReaderReloaded.PARAM_PATH,
                INPUT_BNC_PATH, BncReaderReloaded.PARAM_PATTERNS, patterns.toArray(new String[0]),
                BncReaderReloaded.PARAM_LANGUAGE, "en");
    }

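    /**
     * Reads the given file line by line (UTF-8), prefixing each line with the given
     * inclusion or exclusion pattern prefix.
     */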
    private static List<String> read(String aFile, String aPatternPrefix) throws IOException {
        List<String> patterns = new ArrayList<String>();
        for (String s : FileUtils.readLines(new File(aFile), "UTF-8")) {
            patterns.add(aPatternPrefix + s);
        }
        return patterns;
    }

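    /**
     * Builds the aggregate analysis engine: renames the collection id to BNC, parses
     * with the Stanford factored parser (keeping the Penn trees), and writes each CAS
     * both as an XZ-compressed serialized CAS and as CSV for fast database import,
     * while logging progress.
     */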
    public static AnalysisEngineDescription buildMapperEngine() throws ResourceInitializationException {
        // rename collectionId to BNC (from the path where BNC is located)
        AnalysisEngineDescription rn = createPrimitiveDescription(Renamer.class, Renamer.PARAM_COLLECTION_ID,
                COLLECTION_ID);

        // parse
        AnalysisEngineDescription sp = createPrimitiveDescription(StanfordParser.class,
                StanfordParser.PARAM_WRITE_PENN_TREE, true, StanfordParser.PARAM_LANGUAGE, "en",
                StanfordParser.PARAM_VARIANT, "factored", StanfordParser.PARAM_QUOTE_BEGIN, new String[] { "" },
                StanfordParser.PARAM_QUOTE_END, new String[] { "" });

        // output as serialized cas
        AnalysisEngineDescription scw = createPrimitiveDescription(SerializedCasWriter.class,
                SerializedCasWriter.PARAM_COMPRESSION, CompressionMethod.XZ,
                SerializedCasWriter.PARAM_TARGET_LOCATION, OUTPUT_SER_CAS_PATH,
                SerializedCasWriter.PARAM_STRIP_EXTENSION, true);

        // output as csv for fast db import
        AnalysisEngineDescription csvw = createPrimitiveDescription(PennTreesToCsvWriter.class,
                PennTreesToCsvWriter.PARAM_PATH, OUTPUT_CSV_PATH);

        AnalysisEngineDescription log = createPrimitiveDescription(ProgressLogger.class,
                ProgressLogger.PARAM_BRIEF_OUTPUT, true);

        return createAggregateDescription(rn, sp, scw, csvw, log);
    }

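    /**
     * Runs the full pipeline (reader plus aggregate engine); exceptions are only
     * printed to stderr.
     */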
    public static void main(String[] args) {
        try {
            SimplePipeline.runPipeline(buildCollectionReader(), buildMapperEngine());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
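
The factory methods used above (createCollectionReader, createPrimitiveDescription, createAggregateDescription) are the pre-2.0 uimaFIT names, which uimaFIT 2.x still ships only as deprecated aliases. Below is a minimal sketch of the core of the same wiring (reader, parser, serialized-CAS writer) using the current 2.x method names, createReaderDescription and createEngineDescription. The class name BncLocalCorpusBuilderSketch is made up for illustration; BncReaderReloaded and all parameter constants are taken from the listing above, and the sketch assumes uimaFIT 2.x and the same DKPro Core components on the classpath.

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;

import org.apache.uima.fit.pipeline.SimplePipeline;

import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod;
import de.tudarmstadt.ukp.dkpro.core.io.bincas.SerializedCasWriter;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser;

// Hypothetical class name, for illustration only.
public class BncLocalCorpusBuilderSketch {
    public static void main(String[] args) throws Exception {
        SimplePipeline.runPipeline(
                // createReaderDescription replaces the deprecated createCollectionReader
                createReaderDescription(BncReaderReloaded.class,
                        BncReaderReloaded.PARAM_PATH, "jar:file:D:/ukp/data/BNC.zip!",
                        BncReaderReloaded.PARAM_PATTERNS, new String[] { "[+]**/*.xml" },
                        BncReaderReloaded.PARAM_LANGUAGE, "en"),
                // createEngineDescription replaces createPrimitiveDescription; passing
                // several descriptions to runPipeline replaces createAggregateDescription
                createEngineDescription(StanfordParser.class,
                        StanfordParser.PARAM_WRITE_PENN_TREE, true,
                        StanfordParser.PARAM_LANGUAGE, "en",
                        StanfordParser.PARAM_VARIANT, "factored"),
                createEngineDescription(SerializedCasWriter.class,
                        SerializedCasWriter.PARAM_TARGET_LOCATION, "D:/ukp/data/output2/BNC/serialized/",
                        SerializedCasWriter.PARAM_COMPRESSION, CompressionMethod.XZ,
                        SerializedCasWriter.PARAM_STRIP_EXTENSION, true));
    }
}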