it.drwolf.ridire.utility.CorpusPackager.java Source code

Java tutorial

Introduction

Here is the source code for it.drwolf.ridire.utility.CorpusPackager.java

Source

/*******************************************************************************
 * Copyright 2013 Università degli Studi di Firenze
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.drwolf.ridire.utility;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.RegexFileFilter;
import org.apache.commons.lang.text.StrTokenizer;

/**
 * Command-line tool that reads a description file line by line and copies the
 * text resource referenced by each line into a category sub-directory of a
 * destination directory.
 * <p>
 * Required options: {@code -f} (description file), {@code -d} (destination
 * directory), {@code -m} (minimum word count, inclusive) and {@code -M}
 * (maximum word count, exclusive).
 * <p>
 * Each description line is tokenized with a default-configured
 * {@link StrTokenizer} (empty tokens retained). Only lines that yield exactly
 * five tokens are processed, read in this order: word count, job name, file
 * digest, functional category, semantic category. All other lines are silently
 * ignored.
 */
public class CorpusPackager {
    /** Root directory under which the crawler job directories are looked up. */
    private static final String JOBS_DIR = "/home/drwolf/heritrix-3.1.1-SNAPSHOT/jobs/";

    /**
     * Program entry point; all the work is done by the constructor.
     *
     * @param args command-line arguments (see the class description)
     */
    public static void main(String[] args) {
        new CorpusPackager(args);
    }

    private Options options;
    private String dirName;
    private String descFile;
    StrTokenizer strTokenizer = new StrTokenizer();
    private Integer minValue;
    private Integer maxValue;

    /**
     * Parses the command line, then processes every line of the description
     * file. An I/O failure is reported on standard error and stops processing.
     *
     * @param args command-line arguments; missing or invalid options terminate
     *             the JVM with exit status -1
     */
    public CorpusPackager(String[] args) {
        this.createOptions();
        this.parseOptions(args);
        BufferedReader bufferedReader = null;
        try {
            bufferedReader = new BufferedReader(new FileReader(this.descFile));
            String fileLine;
            while ((fileLine = bufferedReader.readLine()) != null) {
                this.processLine(fileLine);
            }
        } catch (IOException e) {
            System.err.println("Error while processing " + this.descFile + ": " + e.getMessage());
            e.printStackTrace();
        } finally {
            // Close on every path so a failure halfway through does not leak the handle.
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /** Declares the four command-line options understood by the tool. */
    private void createOptions() {
        this.options = new Options();
        this.options.addOption(new Option("f", "descfile", true, "description file"));
        this.options.addOption(new Option("d", "destDir", true, "destination dir"));
        this.options.addOption(new Option("m", "min", true, "min value"));
        this.options.addOption(new Option("M", "max", true, "max value"));
    }

    /**
     * Prints an error message followed by the usage help, then terminates the
     * JVM with exit status -1.
     *
     * @param message the error to show on standard error
     */
    private void exitWithUsage(String message) {
        System.err.println(message);
        new HelpFormatter().printHelp("CorpusPackager", this.options);
        System.exit(-1);
    }

    /**
     * Reads a mandatory integer option, terminating with a usage message when
     * it is missing or not a valid integer.
     *
     * @param cmdline the parsed command line
     * @param opt     short option name
     * @param label   human-readable name used in error messages
     * @return the parsed value
     */
    private Integer requireIntOption(org.apache.commons.cli.CommandLine cmdline, String opt, String label) {
        String raw = cmdline.getOptionValue(opt);
        if (raw == null) {
            this.exitWithUsage("No " + label + " value provided.");
        }
        try {
            return Integer.valueOf(Integer.parseInt(raw));
        } catch (NumberFormatException e) {
            this.exitWithUsage("Invalid " + label + " value: " + raw);
            return null; // not reached: exitWithUsage terminates the JVM
        }
    }

    /**
     * Parses {@code args} and stores the destination directory, description
     * file and min/max bounds. Any missing or malformed option prints the usage
     * help and terminates the JVM with exit status -1.
     *
     * @param args raw command-line arguments
     */
    private void parseOptions(String[] args) {
        CommandLineParser parser = new GnuParser();
        org.apache.commons.cli.CommandLine cmdline = null;
        try {
            // parse the command line arguments
            cmdline = parser.parse(this.options, args);
        } catch (ParseException exp) {
            this.exitWithUsage("Parsing failed.  Reason: " + exp.getMessage());
        }
        if (cmdline == null) {
            return;
        }
        this.dirName = cmdline.getOptionValue("d");
        if (this.dirName == null) {
            this.exitWithUsage("No directory provided.");
        }
        this.descFile = cmdline.getOptionValue("f");
        if (this.descFile == null) {
            this.exitWithUsage("No description file provided.");
        }
        this.minValue = this.requireIntOption(cmdline, "m", "min");
        this.maxValue = this.requireIntOption(cmdline, "M", "max");
    }

    /**
     * Handles one line of the description file: locates the job directory and
     * the resource file, and copies the file into the matching category
     * directory below the destination directory.
     * <p>
     * Lines are skipped (with a message) when the word count is not a number,
     * the job directory or the resource file cannot be found, or the word
     * count lies outside {@code [minValue, maxValue)}. Lines that do not split
     * into exactly five tokens are ignored without a message.
     *
     * @param fileLine one raw line of the description file
     * @throws IOException if copying the resource file fails
     */
    private void processLine(String fileLine) throws IOException {
        this.strTokenizer.setIgnoreEmptyTokens(false);
        this.strTokenizer.reset(fileLine);
        String[] tokens = this.strTokenizer.getTokenArray();
        if (tokens.length != 5) {
            return;
        }

        int words;
        try {
            words = Integer.parseInt(tokens[0].trim());
        } catch (NumberFormatException e) {
            // One malformed row (e.g. a header) must not abort the whole run.
            System.err.println("Skipping line with non-numeric word count: " + fileLine);
            return;
        }

        String jobName = tokens[1].trim();
        // Underscores in the job name may appear as spaces in the directory name.
        // NOTE(review): the job name is used as a regular expression, so names
        // containing regex metacharacters may not match as intended - confirm.
        FileFilter fileFilter = new RegexFileFilter(jobName.replaceAll("_", "[_ ]"));
        // listFiles returns null when the jobs directory is missing or unreadable.
        File[] jobsDirs = new File(JOBS_DIR).listFiles(fileFilter);
        if (jobsDirs == null || jobsDirs.length == 0) {
            System.out.println(jobName + "\tnot found.");
            return;
        }
        File jobDirFile = jobsDirs[0];

        String fileDigest = tokens[2].trim();
        if (words < this.minValue || words >= this.maxValue) {
            System.out.println(jobName + "\t" + fileDigest + "\t skipped: out of range.");
            return;
        }

        // Category directory: functional label first, then semantic, then "other".
        String dirname = tokens[3].trim();
        if (dirname.length() < 1) {
            dirname = tokens[4].trim();
        }
        if (dirname.length() < 1) {
            dirname = "other";
        }
        File destDir = new File(this.dirName + System.getProperty("file.separator") + dirname);

        String resourcePath = "/arcs/resources/" + fileDigest + ".txt";
        File f = new File(jobDirFile.getAbsolutePath() + resourcePath);
        if (!f.exists() || !f.canRead()) {
            // Fall back to the "completed-" job directory.
            f = new File(JOBS_DIR + "completed-" + jobName + resourcePath);
            if (!f.exists() || !f.canRead()) {
                System.out.println(jobName + "\t" + fileDigest + "\tnot found.");
                return;
            }
        }
        FileUtils.copyFileToDirectory(f, destDir);
    }
}