ie.pars.bnc.preprocess.MainBNCProcess.java Source code

Introduction

Here is the source code for ie.pars.bnc.preprocess.MainBNCProcess.java
Source

/* 
 * Copyright (C) 2016 Behrang QasemiZadeh <zadeh at phil.hhu.de>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package ie.pars.bnc.preprocess;

import edu.stanford.nlp.parser.common.ParserGrammar;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.TreeSet;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;

/**
 * Command-line driver that walks the entries of a tar archive, hands every
 * selected entry to {@code ProcessNLP.parseBNCXML} and stores the returned
 * text as {@code <id>.vert} in the output directory.
 *
 * <p>An entry is selected when its id (base name up to {@code ".xml"}) starts
 * with the upper-cased prefix given on the command line and no file with that
 * id already exists in the output directory, so an interrupted run can be
 * resumed by simply starting it again.
 *
 * @author bq
 */
public class MainBNCProcess {

    /** Classpath location of the POS tagger model handed to {@link MaxentTagger}. */
    private static final String TAGGER_PATH
            = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    /** Literal suffix that terminates the id part of an entry's base name. */
    private static final String XML_SUFFIX = ".xml";

    /**
     * The tagger, morphology and parser objects are re-created before every
     * N-th processed entry. NOTE(review): presumably this bounds memory growth
     * inside the models over long runs -- confirm before changing.
     */
    private static final int MODEL_RELOAD_INTERVAL = 10;

    private static MaxentTagger tagger;
    private static Morphology m;
    private static ParserGrammar parser;
    /** Ids (file name up to the first dot) already present in the output directory. */
    private static TreeSet<String> filesProcesed;
    /** Prefix an entry id must start with (compared in upper case). */
    static String letter;
    /** Directory that receives the {@code .vert} files. */
    static String pathOutput;
    /** Path of the tar archive to read. */
    static String pathInput;

    /**
     * Entry point for processing BNC text. According to the original author's
     * notes, the CLAWS part-of-speech tags are replaced by Stanford CoreNLP
     * output for consistency with the rest of the data, and most of the BNC
     * structure is discarded (only paragraphs and sentences are kept); that
     * work happens in {@code ProcessNLP}, which is not part of this class.
     *
     * <p>Files already present in the output directory are recorded by the
     * part of their name before the first dot and skipped later on. The output
     * directory is created when it does not exist.
     *
     * @param sugare {@code [0]} path of the input tar archive, {@code [1]}
     *               output directory, {@code [2]} id prefix to process
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws IOException if the output directory cannot be listed or created,
     *                     or if reading/writing fails
     * @throws ArchiveException if the tar stream cannot be opened
     * @throws Exception anything propagated from {@code ProcessNLP.parseBNCXML}
     */
    public static void main(String[] sugare) throws IOException, ArchiveException, Exception {
        if (sugare == null || sugare.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: MainBNCProcess <input-tar> <output-dir> <id-prefix>");
        }
        pathInput = sugare[0];
        pathOutput = sugare[1];
        letter = sugare[2];
        filesProcesed = new TreeSet<>();

        File folder = new File(pathOutput);
        if (folder.exists()) {
            // listFiles() yields null when the path is not a directory or cannot be read.
            File[] existing = folder.listFiles();
            if (existing == null) {
                throw new IOException("Output path is not a readable directory: " + pathOutput);
            }
            for (File f : existing) {
                if (f.isFile()) {
                    String name = f.getName();
                    int dot = name.indexOf('.');
                    filesProcesed.add(dot < 0 ? name : name.substring(0, dot));
                }
            }
        } else if (!folder.mkdirs() && !folder.isDirectory()) {
            throw new IOException("Could not create output directory: " + pathOutput);
        }
        getZippedFile();
    }

    /**
     * Iterates over the tar archive at {@link #pathInput} and processes every
     * non-directory entry whose id starts with the upper-cased {@link #letter}
     * and is not yet listed in {@link #filesProcesed}; all other entries are
     * reported as bypassed. Prints the number of processed entries at the end.
     *
     * @throws IOException on any read/write failure, including a truncated entry
     * @throws ArchiveException if the tar stream cannot be created
     * @throws Exception anything propagated from {@code ProcessNLP.parseBNCXML}
     */
    private static void getZippedFile() throws IOException, ArchiveException, Exception {
        String parseModel = LexicalizedParser.DEFAULT_PARSER_LOC;
        // Locale.ROOT keeps the prefix independent of the JVM's default locale.
        String idPrefix = letter.toUpperCase(Locale.ROOT);
        int countfiles = 0;

        try (InputStream is = new FileInputStream(pathInput);
                TarArchiveInputStream tarStream = (TarArchiveInputStream) new ArchiveStreamFactory()
                        .createArchiveInputStream("tar", is)) {
            TarArchiveEntry entry;
            while ((entry = (TarArchiveEntry) tarStream.getNextEntry()) != null) {
                if (entry.isDirectory()) {
                    continue;
                }
                String id = entryId(entry.getName());
                if (filesProcesed.contains(id) || !id.startsWith(idPrefix)) {
                    // Nothing is read here; the next getNextEntry() call moves past the data.
                    System.out.println(">> Bypass Entry " + entry.getName());
                    continue;
                }

                if (countfiles++ % MODEL_RELOAD_INTERVAL == 0) {
                    tagger = new MaxentTagger(TAGGER_PATH);
                    m = new Morphology();
                    parser = ParserGrammar.loadModel(parseModel);
                    parser.loadTagger();
                }
                System.out.print("Entry " + entry.getName());

                byte[] content = readEntry(tarStream, entry.getSize());
                StringBuilder parseBNCXML;
                try (InputStream bis = new ByteArrayInputStream(content)) {
                    parseBNCXML = ProcessNLP.parseBNCXML(bis, m, tagger, parser);
                }
                // The output file is opened only after parsing succeeded, so a failed
                // parse does not leave an empty file that would mark the id as done.
                writeVertFile(id, parseBNCXML);
            }
        }
        System.out.println("There are " + countfiles);
    }

    /**
     * Derives the document id from a tar entry name: the part after the last
     * {@code '/'}, cut at the first literal {@code ".xml"} if there is one.
     */
    private static String entryId(String entryName) {
        String base = entryName.substring(entryName.lastIndexOf('/') + 1);
        int suffixAt = base.indexOf(XML_SUFFIX);
        return suffixAt < 0 ? base : base.substring(0, suffixAt);
    }

    /**
     * Reads exactly {@code size} bytes of the current entry. A single
     * {@code read} call may deliver fewer bytes than requested, hence the loop.
     *
     * @throws IOException if the entry is too large for a byte array or the
     *                     stream ends before {@code size} bytes were read
     */
    private static byte[] readEntry(InputStream in, long size) throws IOException {
        if (size < 0 || size > Integer.MAX_VALUE) {
            throw new IOException("Unsupported entry size: " + size);
        }
        byte[] content = new byte[(int) size];
        int offset = 0;
        while (offset < content.length) {
            int read = in.read(content, offset, content.length - offset);
            if (read < 0) {
                throw new IOException("Unexpected end of entry after " + offset
                        + " of " + content.length + " bytes");
            }
            offset += read;
        }
        return content;
    }

    /**
     * Writes {@code body} wrapped in a {@code <text id="...">} element to
     * {@code <pathOutput>/<id>.vert}, encoded as UTF-8.
     */
    private static void writeVertFile(String id, CharSequence body) throws IOException {
        String target = pathOutput + File.separatorChar + id + ".vert";
        try (OutputStream out = new FileOutputStream(target);
                Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8)) {
            writer.write("<text id=\"" + id + "\">\n");
            writer.write(body.toString());
            writer.write("</text>\n");
        }
    }

}