de.citec.sc.corpus.CorpusLoader.java Source code

Introduction

Here is the source code for de.citec.sc.corpus.CorpusLoader.java. The class loads two annotated corpora, the CoNLL dataset and the MicroTagging tweet dataset, from TSV files on disk and returns each as a DefaultCorpus of Document objects carrying gold-standard Annotations.
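
The loader expects the datasets in the following layout under the project root (paths taken from the listing below):

src/main/resources/
└── dataset/
    ├── conll/
    │   └── dataset.tsv
    └── microtag/
        ├── dataset.in
        └── dataset_tweets.out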

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package de.citec.sc.corpus;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang3.StringEscapeUtils;

import utility.VariableID;

/**
 *
 * @author sherzod
 */
public class CorpusLoader {

    // Base path for the bundled datasets, relative to the project root.
    private String datasetsPath = "";

    public CorpusLoader() {
        datasetsPath += "src/main/resources/";
    }

    // Leaves datasetsPath empty, e.g. when callers address the datasets by
    // absolute paths; the isRun flag only selects this variant and is unused.
    public CorpusLoader(boolean isRun) {
    }

    // The corpora this loader can read: the CoNLL training split, the
    // MicroTagging tweet dataset, and the CoNLL testa/testb evaluation splits.
    public enum CorpusName {
        CoNLLTraining, MicroTagging, CoNLLTesta, CoNLLTestb;
    }

    /** Loads the requested corpus from disk and returns it as a DefaultCorpus. */
    public DefaultCorpus loadCorpus(CorpusName corpusName) {
        DefaultCorpus c = new DefaultCorpus();

        List<Document> documents = new ArrayList<>();

        switch (corpusName) {
        case CoNLLTraining:
            documents = getCONLLDocs("training");
            break;
        case CoNLLTesta:
            documents = getCONLLDocs("testa");
            break;
        case CoNLLTestb:
            documents = getCONLLDocs("testb");
            break;
        case MicroTagging:
            documents = getMicroTaggingDocs();
            break;
        }

        c.addDocuments(documents);

        return c;
    }

    // Reads a text file line by line; returns an empty list if the file cannot be read.
    private List<String> readFileAsList(File file) {
        List<String> content = new ArrayList<>();
        // try-with-resources guarantees the reader is closed even if reading fails.
        // UTF-8 is assumed for the dataset files.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String strLine;
            while ((strLine = br.readLine()) != null) {
                content.add(strLine);
            }
        } catch (IOException e) {
            System.err.println("Error reading the file: " + file.getPath() + "\n" + e.getMessage());
        }

        return content;
    }

    // Parses the CoNLL TSV file and keeps only the documents of the requested
    // split ("training", "testa" or "testb"), identified by the -DOCSTART- name.
    private List<Document> getCONLLDocs(String dataset) {
        String file = datasetsPath + "dataset/conll/dataset.tsv";

        List<String> list = readFileAsList(new File(file));

        List<Annotation> goldSet = new ArrayList<>();

        // Plain text of the document currently being assembled.
        String s = "";

        List<Document> docs = new ArrayList<>();

        String docName = "";

        for (String l : list) {
            if (l.startsWith("-DOCSTART-")) {

                if (!s.equals("")) {
                    // Flush the document assembled so far before starting a new one.
                    Document d1 = new Document(s, docName);
                    List<Annotation> annotations = new ArrayList<>();
                    for (Annotation ann : goldSet) {
                        annotations.add(ann.clone());
                    }
                    d1.setGoldStandard(annotations);
                    if (belongsToSplit(d1, dataset) && !d1.getGoldStandard().isEmpty()) {
                        docs.add(d1);
                    }

                    goldSet.clear();
                    s = "";
                }
                docName = l;
            } else {
                String[] content = l.split("\t");

                if (content.length == 6) {
                    // Six columns mark a token in BIO style, e.g.
                    //   European   B   European Commission   ...   http://en.wikipedia.org/wiki/European_Commission   ...
                    // Only the B token opens a new annotation; the I tokens that
                    // follow are appended to the running text like plain words.
                    if (content[1].equals("B")) {

                        // The annotation spans the full surface form (content[2]),
                        // i.e. this B token plus the I tokens that follow it.
                        int startIndex = s.length();
                        int endIndex = s.length() + content[2].length();

                        String label = content[2];

                        String uri = content[4].replace("http://en.wikipedia.org/wiki/", "");

                        label = StringEscapeUtils.unescapeJava(label);

                        try {
                            label = URLDecoder.decode(label, "UTF-8");
                        } catch (Exception e) {
                            // Keep the raw label if it is not valid URL encoding.
                        }

                        uri = StringEscapeUtils.unescapeJava(uri);

                        try {
                            uri = URLDecoder.decode(uri, "UTF-8");
                        } catch (Exception e) {
                            // Keep the raw URI if it is not valid URL encoding.
                        }

                        Annotation a1 = new Annotation(label, uri, startIndex, endIndex,
                                new VariableID("A" + goldSet.size()));
                        goldSet.add(a1);
                        s += content[0] + " ";
                    } else {
                        s += content[0] + " ";
                    }
                }
                // A four-column line is an unannotated token: just extend the text.
                if (content.length == 4) {
                    s += content[0] + " ";
                }
                // A single column is either a sentence break (empty line) or a bare token.
                if (content.length == 1) {
                    if (l.equals("")) {
                        s += "\n";
                    } else {
                        s += content[0] + " ";
                    }
                }
            }
        }

        // Flush the last document; the file does not end with a -DOCSTART- marker.
        if (goldSet.size() > 0 && !s.equals("")) {

            Document d1 = new Document(s, docName);

            List<Annotation> annotations = new ArrayList<>();
            for (Annotation ann : goldSet) {
                annotations.add(ann.clone());
            }
            d1.setGoldStandard(annotations);

            if (belongsToSplit(d1, dataset) && !d1.getGoldStandard().isEmpty()) {
                docs.add(d1);
            }

            goldSet.clear();
            s = "";
        }

        return docs;
    }

    // Decides whether a parsed document belongs to the requested CoNLL split,
    // based on the split marker embedded in its -DOCSTART- name.
    private boolean belongsToSplit(Document d, String dataset) {
        String name = d.getDocumentName();
        if (dataset.equals("training")) {
            return !name.contains("testa") && !name.contains("testb");
        }
        if (dataset.equals("testa") || dataset.equals("testb")) {
            return name.contains(dataset);
        }
        return false;
    }

    // Reads the MicroTagging tweet corpus: dataset_tweets.out holds "docId <TAB> text"
    // lines, and dataset.in holds "docId <TAB> label <TAB> uri ..." lines.
    private List<Document> getMicroTaggingDocs() {
        List<Document> docs = new ArrayList<>();

        List<String> annotations = readFileAsList(new File(datasetsPath + "dataset/microtag/dataset.in"));
        List<String> tweetDocs = readFileAsList(new File(datasetsPath + "dataset/microtag/dataset_tweets.out"));

        HashMap<String, Document> tweetsHashMap = new HashMap<>();

        for (String s : tweetDocs) {
            String docId = s.split("\t")[0];
            // Strip only the leading id; replacing every occurrence of the id
            // could also delete matching text inside the tweet itself.
            String content = s.substring(docId.length()).trim();

            Document d1 = new Document(content, docId);

            tweetsHashMap.put(docId, d1);
        }

        for (String s : annotations) {
            String docId = s.split("\t")[0];
            String ans = s.substring(docId.length()).trim();

            if (!ans.equals("")) {

                // The rest of the line alternates label / URI pairs.
                String[] arrayOfAnnotations = ans.split("\t");
                List<Annotation> goldSet = new ArrayList<>();

                for (int i = 0; i + 1 < arrayOfAnnotations.length; i = i + 2) {

                    String label = arrayOfAnnotations[i];
                    String uri = arrayOfAnnotations[i + 1].replace("http://dbpedia.org/resource/", "");
                    label = StringEscapeUtils.unescapeJava(label);
                    uri = StringEscapeUtils.unescapeJava(uri);
                    try {
                        uri = URLDecoder.decode(uri, "UTF-8");
                    } catch (Exception e) {
                        // Keep the raw URI if it is not valid URL encoding.
                    }

                    try {
                        label = URLDecoder.decode(label, "UTF-8");
                    } catch (Exception e) {
                        // Keep the raw label if it is not valid URL encoding.
                    }
                    // Tweet-level annotations carry no character offsets, hence 0, 0.
                    Annotation a1 = new Annotation(label, uri, 0, 0, new VariableID("S" + goldSet.size()));
                    goldSet.add(a1);
                }

                // Attach the gold standard only if the tweet text was found.
                if (tweetsHashMap.containsKey(docId)) {
                    Document d = tweetsHashMap.get(docId);

                    d.setGoldStandard(goldSet);

                    docs.add(d);
                }
            }

        }

        return docs;
    }
}
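
Usage

Below is a minimal usage sketch. The demo class itself is hypothetical (it is not part of the original file), and the console output depends on DefaultCorpus.toString(), whose source is not shown here; CorpusLoader, CorpusName and loadCorpus are taken from the listing above.

package de.citec.sc.corpus;

// Hypothetical demo class, not part of the original source.
public class CorpusLoaderDemo {

    public static void main(String[] args) {
        // The default constructor resolves datasets under src/main/resources/.
        CorpusLoader loader = new CorpusLoader();

        // Load the CoNLL training split; MicroTagging, CoNLLTesta and
        // CoNLLTestb are loaded the same way.
        DefaultCorpus corpus = loader.loadCorpus(CorpusLoader.CorpusName.CoNLLTraining);

        // What this prints depends on DefaultCorpus.toString().
        System.out.println("Loaded: " + corpus);
    }
}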