// Java tutorial
/* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.tc.svmhmm.util; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.PrintWriter; import java.net.URLDecoder; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; import org.apache.commons.collections.BidiMap; import org.apache.commons.collections.bidimap.DualTreeBidiMap; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVPrinter; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import de.tudarmstadt.ukp.dkpro.lab.engine.TaskContext; import de.tudarmstadt.ukp.dkpro.lab.storage.StorageService; import de.tudarmstadt.ukp.dkpro.tc.core.Constants; import de.tudarmstadt.ukp.dkpro.tc.core.ml.TCMachineLearningAdapter; import de.tudarmstadt.ukp.dkpro.tc.svmhmm.SVMHMMAdapter; import de.tudarmstadt.ukp.dkpro.tc.svmhmm.writer.SVMHMMDataWriter; /** * @author Ivan Habernal */ public final class SVMHMMUtils { /** * File name of serialized mapping from String labels to numbers */ public static final String LABELS_TO_INTEGERS_MAPPING_FILE_NAME 
= "labelsToIntegersMapping_DualTreeBidiMap.bin"; /** * CSV file comment */ public static final String CSV_COMMENT = "Columns: gold, predicted, token, seqID"; /** * Format of CSV files */ public static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withCommentMarker('#'); /** * Where the gold outcomes, predicted outcomes, and tokens are stored */ public static final String GOLD_PREDICTED_OUTCOMES_CSV = "outcomesGoldPredicted.csv"; private SVMHMMUtils() { // empty } /** * Extract all outcomes from featureVectorsFiles (training, test) that are in * LIBSVM format - each line is a feature vector and the first token is the outcome * label * * @param files files in LIBSVM format * @return set of all unique outcomes * @throws java.io.IOException */ public static SortedSet<String> extractOutcomeLabelsFromFeatureVectorFiles(File... files) throws IOException { SortedSet<String> result = new TreeSet<>(); for (File file : files) { result.addAll(extractOutcomeLabels(file)); } return result; } /** * Maps names to numbers (numbers are required by SVMLight format) * * @param names names (e.g., features, outcomes) * @return bidirectional map of name:number */ public static BidiMap mapVocabularyToIntegers(SortedSet<String> names) { BidiMap result = new DualTreeBidiMap(); // start numbering from 1 int index = 1; for (String featureName : names) { result.put(featureName, index); index++; } return result; } /** * Creates a new file in the same directory as {@code featureVectorsFile} and replaces the first * token (outcome label) by its corresponding integer number from the bi-di map * * @param featureVectorsFile file * @param labelsToIntegers mapping * @return new file */ public static File replaceLabelsWithIntegers(File featureVectorsFile, BidiMap labelsToIntegers) throws IOException { File result = new File(featureVectorsFile.getParent(), "mappedLabelsToInt_" + featureVectorsFile.getName()); PrintWriter pw = new PrintWriter(new FileOutputStream(result)); for (String line : 
FileUtils.readLines(featureVectorsFile)) { // split on the first whitespaces, keep the rest String[] split = line.split("\\s", 2); String label = split[0]; String remainingContent = split[1]; // find the integer Integer intOutput = (Integer) labelsToIntegers.get(label); // print to the output stream pw.printf("%d %s%n", intOutput, remainingContent); } IOUtils.closeQuietly(pw); return result; } /** * Saves label-integer mapping to a file * * @param mapping mapping * @param outputFile file * @throws IOException */ public static void saveMapping(BidiMap mapping, File outputFile) throws IOException { ObjectOutputStream objectOutputStream = new ObjectOutputStream(new FileOutputStream(outputFile)); objectOutputStream.writeObject(mapping); IOUtils.closeQuietly(objectOutputStream); } /** * Saves the feature mapping to readable format, each line is a feature id and feature name, * sorted by feature id * * @param mapping mapping (name:id) * @param outputFile output file * @throws IOException */ public static void saveMappingTextFormat(BidiMap mapping, File outputFile) throws IOException { PrintWriter pw = new PrintWriter(new FileOutputStream(outputFile)); // sort values (feature indexes) SortedSet<Object> featureIndexes = new TreeSet<Object>(mapping.values()); for (Object featureIndex : featureIndexes) { pw.printf(Locale.ENGLISH, "%5d %s%n", (int) featureIndex, mapping.getKey(featureIndex).toString()); } IOUtils.closeQuietly(pw); } /** * Loads a serialized BidiMap from file * * @param inputFile input file * @return BidiMap * @throws IOException */ public static BidiMap loadMapping(File inputFile) throws IOException { ObjectInputStream inputStream = new ObjectInputStream(new FileInputStream(inputFile)); try { return (BidiMap) inputStream.readObject(); } catch (ClassNotFoundException e) { throw new IOException(e); } finally { IOUtils.closeQuietly(inputStream); } } /** * Extracts the outcome labels from the file; it corresponds to the first token * on each line. 
* * @param featureVectorsFile featureVectors file * @return list of outcome labels * @throws IOException */ public static List<String> extractOutcomeLabels(File featureVectorsFile) throws IOException { List<String> result = new ArrayList<>(); List<String> lines = FileUtils.readLines(featureVectorsFile); for (String line : lines) { String label = line.split("\\s")[0]; result.add(label); } return result; } /** * Reads the featureVectorsFile and splits comment on each line into a list of strings, i.e. * "TAG qid:4 1:1 2:1 4:2 # token TAG 4" produces "token", "TAG", "4" * * @param featureVectorsFileStream featureVectors file stream * @return list (for each line) of list of comment parts * @throws IOException */ protected static List<List<String>> extractComments(InputStream featureVectorsFileStream // int expectedFieldsCount ) throws IOException, IllegalArgumentException { List<List<String>> result = new ArrayList<>(); List<String> lines = IOUtils.readLines(featureVectorsFileStream); IOUtils.closeQuietly(featureVectorsFileStream); for (String line : lines) { String comment = line.split("#", 2)[1]; List<String> list = new ArrayList<>(); String[] tokens = comment.split("\\s+"); // filter empty tokens for (String token : tokens) { String trim = token.trim(); if (!trim.isEmpty()) { // decode from URL representation String s = URLDecoder.decode(trim, "utf-8"); list.add(s); } } result.add(list); } return result; } /** * Extracts original tokens that are stored in the comment part of the featureVectorsFile * * @param featureVectorsFile featureVectors file * @return list of original tokens * @throws IOException */ public static List<String> extractOriginalTokens(File featureVectorsFile) throws IOException { List<String> result = new ArrayList<>(); List<List<String>> comments = extractComments(new FileInputStream(featureVectorsFile)); for (List<String> comment : comments) { // original token is the first one in comments result.add(comment.get(2)); } return result; } /** * Reads 
the prediction file (each line is a integer) and converts them into original outcome * labels using the mapping provided by the bi-directional map * * @param predictionsFile predictions from classifier * @param labelsToIntegersMapping mapping outcomeLabel:integer * @return list of outcome labels * @throws IOException */ public static List<String> extractOutcomeLabelsFromPredictions(File predictionsFile, BidiMap labelsToIntegersMapping) throws IOException { List<String> result = new ArrayList<>(); for (String line : FileUtils.readLines(predictionsFile)) { Integer intLabel = Integer.valueOf(line); String outcomeLabel = (String) labelsToIntegersMapping.getKey(intLabel); result.add(outcomeLabel); } return result; } /** * Returns a list of original sequence IDs extracted from comments * * @param featureVectorsFile featureVectors file * @return list of integers * @throws IOException */ public static List<Integer> extractOriginalSequenceIDs(File featureVectorsFile) throws IOException { List<Integer> result = new ArrayList<>(); List<List<String>> comments = extractComments(new FileInputStream(featureVectorsFile)); for (List<String> comment : comments) { // sequence number is the third token in the comment token result.add(Integer.valueOf(comment.get(1))); } return result; } /** * Given confusion matrix, it writes it in CSV and LaTeX form to the tasks output directory, * and also prints evaluations (F-measure, Precision, Recall) * * @param context task context * @param confusionMatrix confusion matrix * @throws java.io.IOException */ public static void writeOutputResults(TaskContext context, ConfusionMatrix confusionMatrix) throws IOException { writeOutputResults(context, confusionMatrix, null); } /** * Given confusion matrix, it writes it in CSV and LaTeX form to the tasks output directory, * and also prints evaluations (F-measure, Precision, Recall) * * @param context task context * @param confusionMatrix confusion matrix * @param filePrefix prefix of output files * 
@throws java.io.IOException */ public static void writeOutputResults(TaskContext context, ConfusionMatrix confusionMatrix, String filePrefix) throws IOException { // storing the results as latex confusion matrix String confMatrixFileTex = (filePrefix != null ? filePrefix : "") + "confusionMatrix.tex"; File evaluationFileLaTeX = new File( context.getStorageLocation(Constants.TEST_TASK_OUTPUT_KEY, StorageService.AccessMode.READWRITE), confMatrixFileTex); FileUtils.writeStringToFile(evaluationFileLaTeX, confusionMatrix.toStringLatex()); // as CSV confusion matrix String confMatrixFileCsv = (filePrefix != null ? filePrefix : "") + "confusionMatrix.csv"; File evaluationFileCSV = new File( context.getStorageLocation(Constants.TEST_TASK_OUTPUT_KEY, StorageService.AccessMode.READWRITE), confMatrixFileCsv); CSVPrinter csvPrinter = new CSVPrinter(new FileWriter(evaluationFileCSV), CSVFormat.DEFAULT); csvPrinter.printRecords(confusionMatrix.toStringMatrix()); IOUtils.closeQuietly(csvPrinter); // and results File evaluationFile = new File( context.getStorageLocation(Constants.TEST_TASK_OUTPUT_KEY, StorageService.AccessMode.READWRITE), new SVMHMMAdapter() .getFrameworkFilename(TCMachineLearningAdapter.AdapterNameEntries.evaluationFile)); PrintWriter pw = new PrintWriter(evaluationFile); pw.println(confusionMatrix.printNiceResults()); pw.println(confusionMatrix.printLabelPrecRecFm()); pw.println(confusionMatrix.printClassDistributionGold()); IOUtils.closeQuietly(pw); } public static List<SortedMap<String, String>> extractMetaDataFeatures(File featureVectorsFile) throws IOException { InputStream inputStream = new FileInputStream(featureVectorsFile); List<SortedMap<String, String>> result = new ArrayList<>(); List<List<String>> allComments = extractComments(inputStream); for (List<String> instanceComments : allComments) { SortedMap<String, String> instanceResult = new TreeMap<>(); for (String comment : instanceComments) { if 
(comment.startsWith(SVMHMMDataWriter.META_DATA_FEATURE_PREFIX)) { String[] split = comment.split(":"); String key = split[0]; String value = split[1]; instanceResult.put(key, value); } } result.add(instanceResult); } IOUtils.closeQuietly(inputStream); return result; } }