Java tutorial
/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.argumentation.cleaning;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.util.Collection;

/**
 * Command-line tool that rewrites every {@code .txt} file below a directory with the
 * result of {@link TextCleaningUtils#normalize}, with the stated goal of removing
 * non-standard characters from the data.
 *
 * <p>Files are overwritten in place; no backup copy is made.
 *
 * @author Ivan Habernal
 */
public class DataCleaner
{
    /**
     * Normalizes all {@code .txt} files found under the data directory.
     *
     * @param args optional; when at least one argument is given, {@code args[0]} is used
     *             as the data directory, otherwise {@code data/} (relative to the working
     *             directory) is used
     * @throws IllegalArgumentException if no {@code .txt} file is found under the directory
     * @throws Exception                if listing, reading, or writing a file fails
     */
    public static void main(String[] args)
            throws Exception
    {
        // First command-line argument overrides the default location.
        File inputDir = new File(args.length > 0 ? args[0] : "data/");

        // The trailing 'true' asks FileUtils to descend into sub-directories as well.
        String[] extensions = { "txt" };
        Collection<File> textFiles = FileUtils.listFiles(inputDir, extensions, true);

        // Fail loudly rather than silently doing nothing on a wrong path.
        if (textFiles.isEmpty()) {
            throw new IllegalArgumentException("No .txt files found in " + inputDir);
        }

        for (File textFile : textFiles) {
            String original = FileUtils.readFileToString(textFile, "utf-8");

            String cleaned = TextCleaningUtils.normalize(original);

            // Overwrite the source file with its normalized content.
            FileUtils.writeStringToFile(textFile, cleaned, "utf-8");
        }
    }
}