it.drwolf.ridire.utility.RIDIREPlainTextCleaner.java Source code

Java tutorial

Introduction

Here is the source code for it.drwolf.ridire.utility.RIDIREPlainTextCleaner.java

Source

/*******************************************************************************
 * Copyright 2013 Università degli Studi di Firenze
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.drwolf.ridire.utility;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;

/**
 * Command-line utility that rewrites the plain-text files of a directory in
 * place after applying a fixed list of regex substitutions (mis-decoded
 * quotes/dashes and a boilerplate phrase).
 *
 * <p>
 * Options: {@code -d/--dir <directory>} (required) and
 * {@code -s/--dirWithSpaces} (only process the directory if its name contains a
 * space).
 */
public class RIDIREPlainTextCleaner {

    /**
     * Accepts readable regular files whose absolute path ends with "txt".
     */
    private static class PlainTextFileFilter implements FileFilter {

        @Override
        public boolean accept(File f) {
            // isFile() keeps directories that happen to end in "txt" out of the
            // list; they could never be read as text anyway.
            return f != null && f.isFile() && f.canRead() && f.getAbsolutePath().endsWith("txt");
        }

    }

    /** Encoding used both for reading and for writing the text files. */
    private static final String ENCODING = "UTF-8";

    /**
     * Ordered {regex, replacement} pairs applied by {@link #getCleanText(File)}.
     *
     * NOTE(review): several rules below have an EMPTY pattern in this copy of
     * the file. Presumably they originally contained non-printable characters
     * that were lost; restore them from the original source. Until then they are
     * skipped by {@link #substitute(String, String, String)} instead of
     * corrupting the text.
     */
    private static final String[][] SUBSTITUTIONS = {
            { "", "\"" },
            { "", "\"" },
            { "", "\"" },
            // Sequences of U+0080 followed by another control character.
            // NOTE(review): these look like UTF-8 punctuation decoded as
            // Latin-1 - confirm.
            { "\\u0080\\u0099", "'" },
            { "\\u0080\\u009c", "\"" },
            { "\\u0080\\u009d", "\"" },
            { "\\u0080\\u0093", "-" },
            { "\\u0092", "'" },
            { "\\u0093", "'" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "" },
            { "", "..." },
            { "Follow us on Twitter ", "" },
    };

    /**
     * Program entry point; all work happens in the constructor.
     *
     * @param args
     *            command-line arguments (see the class description)
     */
    public static void main(String[] args) {
        new RIDIREPlainTextCleaner(args);

    }

    private Options options;
    private String dirName;
    private boolean onlyDirsWithSpaces;

    /**
     * Parses the command line and cleans every matching file of the requested
     * directory, reporting progress on standard output. A failure on one file
     * is reported and does not stop the processing of the remaining files.
     *
     * @param args
     *            command-line arguments; when {@code null} nothing is done
     */
    public RIDIREPlainTextCleaner(String[] args) {
        if (args == null) {
            return;
        }
        this.createOptions();
        this.parseOptions(args);
        for (File f : this.getPlainTextFiles()) {
            System.out.print("Cleaning file: " + f.getName() + "...");
            try {
                this.cleanTextFile(f);
                System.out.println(" done.");
            } catch (IOException e) {
                // Do not claim success for a file that could not be cleaned.
                System.out.println(" failed.");
                System.err.println("Could not clean " + f.getAbsolutePath() + ": " + e);
            }
        }
    }

    /**
     * Replaces the content of {@code f} with its cleaned version.
     *
     * <p>
     * The cleaned text is first written to a sibling file with the extra suffix
     * ".tmp"; the original is then deleted and the temporary file moved into
     * its place.
     *
     * @param f
     *            the text file to clean in place
     * @throws IOException
     *             if the file cannot be read, written or moved
     */
    public void cleanTextFile(File f) throws IOException {
        String cleaned = this.getCleanText(f);
        File tmpFile = new File(f.getCanonicalPath() + ".tmp");
        // Write with the same encoding used for reading; the two-argument
        // overload would use the platform default charset instead.
        FileUtils.writeStringToFile(tmpFile, cleaned, ENCODING);
        // moveFile refuses to overwrite an existing destination.
        FileUtils.deleteQuietly(f);
        FileUtils.moveFile(tmpFile, f);
    }

    /** Declares the supported command-line options. */
    private void createOptions() {
        this.options = new Options();
        Option dir = new Option("d", "dir", true, "input directory");
        this.options.addOption(dir);
        Option onlyDirsWithSpaces = new Option("s", "dirWithSpaces", false, "Process only dirs with spaces");
        this.options.addOption(onlyDirsWithSpaces);
    }

    /**
     * Reads {@code f} as UTF-8 and applies every rule of {@link #SUBSTITUTIONS}
     * in order.
     *
     * @param f
     *            the file to read
     * @return the file content after all substitutions
     * @throws IOException
     *             if the file cannot be read
     */
    public String getCleanText(File f) throws IOException {
        String fileContent = FileUtils.readFileToString(f, ENCODING);
        for (String[] rule : SUBSTITUTIONS) {
            fileContent = this.substitute(fileContent, rule[0], rule[1]);
        }
        return fileContent;
    }

    /**
     * Lists the files to clean.
     *
     * @return the files of the configured directory accepted by
     *         {@link PlainTextFileFilter}; an empty array (never {@code null})
     *         when the directory is skipped by the "dirWithSpaces" option, is
     *         not a directory, or cannot be listed
     */
    private File[] getPlainTextFiles() {
        if (this.onlyDirsWithSpaces && !this.dirName.contains(" ")) {
            return new File[0];
        }
        File dir = new File(this.dirName);
        if (!dir.isDirectory()) {
            System.err.println("Not a directory: " + this.dirName);
            return new File[0];
        }
        // listFiles returns null when an I/O error occurs.
        File[] files = dir.listFiles(new PlainTextFileFilter());
        return files != null ? files : new File[0];
    }

    /**
     * Parses {@code args} into {@link #dirName} and {@link #onlyDirsWithSpaces}.
     * Prints the usage help and terminates the JVM with status -1 when parsing
     * fails or no directory is given.
     *
     * @param args
     *            command-line arguments
     */
    private void parseOptions(String[] args) {
        HelpFormatter formatter = new HelpFormatter();
        CommandLineParser parser = new GnuParser();
        CommandLine cmdline = null;
        try {
            // parse the command line arguments
            cmdline = parser.parse(this.options, args);
        } catch (ParseException exp) {
            // oops, something went wrong
            System.err.println("Parsing failed.  Reason: " + exp.getMessage());
            formatter.printHelp("RIDIREPlainTextCleaner", this.options);
            System.exit(-1);
        }
        if (cmdline != null) {
            this.dirName = cmdline.getOptionValue("d");
            if (this.dirName == null) {
                System.err.println("No directory provided.");
                formatter.printHelp("RIDIREPlainTextCleaner", this.options);
                System.exit(-1);
            }
            this.onlyDirsWithSpaces = cmdline.hasOption("s");
        }
    }

    /**
     * Replaces every match of {@code regex} in {@code original} with
     * {@code subst}.
     *
     * @param original
     *            the text to process
     * @param regex
     *            the regular expression to search for; a {@code null} or empty
     *            pattern leaves the text unchanged
     * @param subst
     *            the replacement (regex replacement syntax applies)
     * @return the text after replacement
     */
    private String substitute(String original, String regex, String subst) {
        // An empty pattern matches at every position, so replaceAll would
        // insert the replacement between all characters of the text.
        if (regex == null || regex.isEmpty()) {
            return original;
        }
        Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
        Matcher m = p.matcher(original);
        return m.replaceAll(subst);
    }

}