List of usage examples for org.apache.commons.csv CSVFormat withEscape
public CSVFormat withEscape(final Character escape)
From source file:org.languagetool.rules.spelling.suggestions.SuggestionChangesTest.java
public void testChanges() throws IOException, InterruptedException { File configFile = new File(System.getProperty("config", "SuggestionChangesTestConfig.json")); ObjectMapper mapper = new ObjectMapper(new JsonFactory().enable(JsonParser.Feature.ALLOW_COMMENTS)); SuggestionChangesTestConfig config = mapper.readValue(configFile, SuggestionChangesTestConfig.class); SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss"); String timestamp = dateFormat.format(new Date()); Path loggingFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.log", timestamp)); Path datasetFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.csv", timestamp)); BufferedWriter writer = Files.newBufferedWriter(loggingFile); CSVPrinter datasetWriter = new CSVPrinter(Files.newBufferedWriter(datasetFile), CSVFormat.DEFAULT.withEscape('\\')); List<String> datasetHeader = new ArrayList<>( Arrays.asList("sentence", "correction", "covered", "replacement", "dataset_id")); SuggestionsChanges.init(config, writer); writer.write("Evaluation configuration: \n"); String configContent = String.join("\n", Files.readAllLines(configFile.toPath())); writer.write(configContent);/*ww w .j a v a2 s . c om*/ writer.write("\nRunning experiments: \n"); int experimentId = 0; for (SuggestionChangesExperiment experiment : SuggestionsChanges.getInstance().getExperiments()) { experimentId++; writer.write(String.format("#%d: %s%n", experimentId, experiment)); datasetHeader.add(String.format("experiment_%d_suggestions", experimentId)); datasetHeader.add(String.format("experiment_%d_metadata", experimentId)); datasetHeader.add(String.format("experiment_%d_suggestions_metadata", experimentId)); } writer.newLine(); datasetWriter.printRecord(datasetHeader); BlockingQueue<SuggestionTestData> tasks = new LinkedBlockingQueue<>(1000); ConcurrentLinkedQueue<Pair<SuggestionTestResultData, String>> results = new ConcurrentLinkedQueue<>(); List<SuggestionTestThread> threads = new ArrayList<>(); for (int i = 0; i < Runtime.getRuntime().availableProcessors(); i++) { SuggestionTestThread worker = new SuggestionTestThread(tasks, results); worker.start(); threads.add(worker); } // Thread for writing results from worker threads into CSV Thread logger = new Thread(() -> { try { long messages = 0; //noinspection InfiniteLoopStatement while (true) { Pair<SuggestionTestResultData, String> message = results.poll(); if (message != null) { writer.write(message.getRight()); SuggestionTestResultData result = message.getLeft(); int datasetId = 1 + config.datasets.indexOf(result.getInput().getDataset()); if (result != null && result.getSuggestions() != null && !result.getSuggestions().isEmpty() && result.getSuggestions().stream() .noneMatch(m -> m.getSuggestedReplacements() == null || m.getSuggestedReplacements().isEmpty())) { List<Object> record = new ArrayList<>(Arrays.asList(result.getInput().getSentence(), result.getInput().getCorrection(), result.getInput().getCovered(), result.getInput().getReplacement(), datasetId)); for (RuleMatch match : result.getSuggestions()) { List<String> suggestions = match.getSuggestedReplacements(); record.add(mapper.writeValueAsString(suggestions)); // features extracted by SuggestionsOrdererFeatureExtractor record.add(mapper.writeValueAsString(match.getFeatures())); List<SortedMap<String, Float>> suggestionsMetadata = new ArrayList<>(); for (SuggestedReplacement replacement : match.getSuggestedReplacementObjects()) { suggestionsMetadata.add(replacement.getFeatures()); } record.add(mapper.writeValueAsString(suggestionsMetadata)); } datasetWriter.printRecord(record); } if (++messages % 1000 == 0) { writer.flush(); System.out.printf("Evaluated %d corrections.%n", messages); } } } } catch (IOException e) { throw new RuntimeException(e); } }); logger.setDaemon(true); logger.start(); // format straight from database dump String[] header = { "id", "sentence", "correction", "language", "rule_id", "suggestion_pos", "accept_language", "country", "region", "created_at", "updated_at", "covered", "replacement", "text_session_id", "client" }; int datasetId = 0; // read data, send to worker threads via queue for (SuggestionChangesDataset dataset : config.datasets) { writer.write(String.format("Evaluating dataset #%d: %s.%n", ++datasetId, dataset)); CSVFormat format = CSVFormat.DEFAULT; if (dataset.type.equals("dump")) { format = format.withEscape('\\').withNullString("\\N").withHeader(header); } else if (dataset.type.equals("artificial")) { format = format.withEscape('\\').withFirstRecordAsHeader(); } try (CSVParser parser = new CSVParser(new FileReader(dataset.path), format)) { for (CSVRecord record : parser) { String lang = record.get("language"); String rule = dataset.type.equals("dump") ? record.get("rule_id") : ""; String covered = record.get("covered"); String replacement = record.get("replacement"); String sentence = record.get("sentence"); String correction = record.isSet("correction") ? record.get("correction") : ""; String acceptLanguage = dataset.type.equals("dump") ? record.get("accept_language") : ""; if (sentence == null || sentence.trim().isEmpty()) { continue; } if (!config.language.equals(lang)) { continue; // TODO handle auto maybe? } if (dataset.type.equals("dump") && !config.rule.equals(rule)) { continue; } // correction column missing in export from doccano; workaround if (dataset.enforceCorrect && !record.isSet("correction")) { throw new IllegalStateException("enforceCorrect in dataset configuration enabled," + " but column 'correction' is not set for entry " + record); } if (dataset.type.equals("dump") && dataset.enforceAcceptLanguage) { if (acceptLanguage != null) { String[] entries = acceptLanguage.split(",", 2); if (entries.length == 2) { String userLanguage = entries[0]; // TODO: what to do with e.g. de-AT,de-DE;... if (!config.language.equals(userLanguage)) { continue; } } } } tasks.put(new SuggestionTestData(lang, sentence, covered, replacement, correction, dataset)); } } } for (Thread t : threads) { t.join(); } logger.join(10000L); logger.interrupt(); datasetWriter.close(); }
From source file:trainer.userinput.TrainingFileDB.java
public static CSVFormat getCSVFormat() { // Create the CSVFormat object with "\n" as a record delimiter CSVFormat csvFileFormat = CSVFormat.TDF.withRecordSeparator(NEW_LINE_SEPARATOR); csvFileFormat = csvFileFormat.withEscape('^'); csvFileFormat = csvFileFormat.withQuoteMode(QuoteMode.NONE); return csvFileFormat; }