de.tudarmstadt.ukp.csniper.resbuild.stuff.InclusionsCreator.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.csniper.resbuild.stuff.InclusionsCreator.java

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.csniper.resbuild.stuff;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.junit.Test;

public class InclusionsCreator {
    private static final String BASE_0 = "D:\\ukp\\data\\bnc\\xml";
    private static final String BASE = "D:\\hadoop\\output\\BNC_new";

    @Test
    public void createInclusionsFile() throws IOException {
        List<String> all = read(BASE_0, "xml");
        //      all.removeAll(cut(read(BASE, "csv"), read(BASE, "xz")));
        all.removeAll(cut(read(BASE, "csv"), stripPlusXml(read("serialized.txt"))));
        //      all.removeAll(read(BASE, "csv"));
        FileUtils.writeLines(new File(BASE, "inclusions.txt"), "UTF-8", all);
        //      all.removeAll(read(BASE, "xz"));
        //      all.removeAll(read(BASE, "csv"));
        IOUtils.writeLines(all, "\n", System.out, "UTF-8");
    }

    private List<String> read(String aFile) throws IOException {
        return FileUtils.readLines(new File(BASE, aFile), "UTF-8");
    }

    private List<String> read(String aDir, String... aExts) throws IOException {
        List<String> ret = new ArrayList<String>();
        for (File f : FileUtils.listFiles(new File(aDir), aExts, true)) {
            ret.add(stripPlusXml(f.getName()));
        }
        return ret;
    }

    private String stripPlusXml(String aStr) {
        return StringUtils.substringBefore(aStr, ".") + ".xml";
    }

    private List<String> stripPlusXml(List<String> aList) {
        List<String> ret = new ArrayList<String>();
        for (String s : aList) {
            ret.add(stripPlusXml(s));
        }
        return ret;
    }

    private List<String> cut(List<String> aList1, List<String> aList2) {
        List<String> ret = new ArrayList<String>(aList1);
        ret.retainAll(aList2);
        return ret;
    }
}