Example usage for org.apache.commons.csv CSVFormat withEscape

List of usage examples for org.apache.commons.csv CSVFormat withEscape

Introduction

On this page you can find example usages of org.apache.commons.csv.CSVFormat.withEscape.

Prototype

public CSVFormat withEscape(final Character escape) 

Source Link

Document

Sets the escape character of the format to the specified character.

Usage

From source file: org.languagetool.rules.spelling.suggestions.SuggestionChangesTest.java

/**
 * Runs the configured suggestion-change experiments over all configured datasets.
 *
 * <p>The configuration is read from the JSON file named by the {@code config} system property
 * (default {@code SuggestionChangesTestConfig.json}). Two timestamped files are created in
 * {@code config.logDir}: a text log and a CSV dataset. Dataset rows are read on the calling
 * thread and handed to one {@code SuggestionTestThread} per available processor through a
 * bounded queue; a separate logger thread writes the workers' results to the log and CSV.
 *
 * <p>Both output files are closed (and therefore flushed) when the method exits, whether
 * normally or by exception.
 *
 * @throws IOException if the configuration, a dataset, or one of the output files cannot be
 *         read or written
 * @throws InterruptedException if the calling thread is interrupted while queueing tasks or
 *         while waiting for the worker or logger threads
 * @throws IllegalStateException if a dataset has {@code enforceCorrect} set but a record has
 *         no {@code correction} column
 */
public void testChanges() throws IOException, InterruptedException {

    File configFile = new File(System.getProperty("config", "SuggestionChangesTestConfig.json"));
    ObjectMapper mapper = new ObjectMapper(new JsonFactory().enable(JsonParser.Feature.ALLOW_COMMENTS));
    SuggestionChangesTestConfig config = mapper.readValue(configFile, SuggestionChangesTestConfig.class);

    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss");
    String timestamp = dateFormat.format(new Date());
    Path loggingFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.log", timestamp));
    Path datasetFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.csv", timestamp));

    // try-with-resources: both outputs are flushed and closed on every exit path.
    try (BufferedWriter writer = Files.newBufferedWriter(loggingFile);
            CSVPrinter datasetWriter = new CSVPrinter(Files.newBufferedWriter(datasetFile),
                    CSVFormat.DEFAULT.withEscape('\\'))) {

        List<String> datasetHeader = new ArrayList<>(
                Arrays.asList("sentence", "correction", "covered", "replacement", "dataset_id"));

        SuggestionsChanges.init(config, writer);
        writer.write("Evaluation configuration: \n");
        String configContent = String.join("\n", Files.readAllLines(configFile.toPath()));
        writer.write(configContent);
        writer.write("\nRunning experiments: \n");
        int experimentId = 0;
        for (SuggestionChangesExperiment experiment : SuggestionsChanges.getInstance().getExperiments()) {
            experimentId++;
            writer.write(String.format("#%d: %s%n", experimentId, experiment));
            // three CSV columns per experiment, matching the three values added per match below
            datasetHeader.add(String.format("experiment_%d_suggestions", experimentId));
            datasetHeader.add(String.format("experiment_%d_metadata", experimentId));
            datasetHeader.add(String.format("experiment_%d_suggestions_metadata", experimentId));
        }
        writer.newLine();
        datasetWriter.printRecord(datasetHeader);

        BlockingQueue<SuggestionTestData> tasks = new LinkedBlockingQueue<>(1000);
        ConcurrentLinkedQueue<Pair<SuggestionTestResultData, String>> results = new ConcurrentLinkedQueue<>();
        List<SuggestionTestThread> threads = new ArrayList<>();
        for (int i = 0; i < Runtime.getRuntime().availableProcessors(); i++) {
            SuggestionTestThread worker = new SuggestionTestThread(tasks, results);
            worker.start();
            threads.add(worker);
        }

        // Set once every worker has terminated; tells the logger thread to drain and exit.
        // A plain flag is used instead of Thread.interrupt(): the output files are backed by
        // interruptible NIO channels, so writing with the interrupt status set would close them.
        final java.util.concurrent.atomic.AtomicBoolean workersDone =
                new java.util.concurrent.atomic.AtomicBoolean(false);

        // Thread for writing results from worker threads into CSV
        Thread logger = new Thread(() -> {
            try {
                long messages = 0;
                while (true) {
                    // Read the flag BEFORE polling: if it was already set, every result had been
                    // enqueued before this poll, so an empty queue really means "finished".
                    boolean noMoreProducers = workersDone.get();
                    Pair<SuggestionTestResultData, String> message = results.poll();
                    if (message == null) {
                        if (noMoreProducers) {
                            break;
                        }
                        continue;
                    }
                    writer.write(message.getRight());

                    SuggestionTestResultData result = message.getLeft();
                    // Only emit a CSV row when every match carries at least one suggestion.
                    if (result != null && result.getSuggestions() != null && !result.getSuggestions().isEmpty()
                            && result.getSuggestions().stream()
                                    .noneMatch(m -> m.getSuggestedReplacements() == null
                                            || m.getSuggestedReplacements().isEmpty())) {

                        // computed after the null check so a null result is skipped, not an NPE
                        int datasetId = 1 + config.datasets.indexOf(result.getInput().getDataset());

                        List<Object> record = new ArrayList<>(Arrays.asList(result.getInput().getSentence(),
                                result.getInput().getCorrection(), result.getInput().getCovered(),
                                result.getInput().getReplacement(), datasetId));
                        for (RuleMatch match : result.getSuggestions()) {
                            List<String> suggestions = match.getSuggestedReplacements();
                            record.add(mapper.writeValueAsString(suggestions));
                            // features extracted by SuggestionsOrdererFeatureExtractor
                            record.add(mapper.writeValueAsString(match.getFeatures()));
                            List<SortedMap<String, Float>> suggestionsMetadata = new ArrayList<>();
                            for (SuggestedReplacement replacement : match.getSuggestedReplacementObjects()) {
                                suggestionsMetadata.add(replacement.getFeatures());
                            }
                            record.add(mapper.writeValueAsString(suggestionsMetadata));
                        }
                        datasetWriter.printRecord(record);
                    }

                    if (++messages % 1000 == 0) {
                        writer.flush();
                        System.out.printf("Evaluated %d corrections.%n", messages);
                    }
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
        logger.setDaemon(true);
        logger.start();

        // format straight from database dump
        String[] header = { "id", "sentence", "correction", "language", "rule_id", "suggestion_pos",
                "accept_language", "country", "region", "created_at", "updated_at", "covered", "replacement",
                "text_session_id", "client" };

        int datasetId = 0;
        // read data, send to worker threads via queue
        for (SuggestionChangesDataset dataset : config.datasets) {

            writer.write(String.format("Evaluating dataset #%d: %s.%n", ++datasetId, dataset));

            CSVFormat format = CSVFormat.DEFAULT;
            if (dataset.type.equals("dump")) {
                format = format.withEscape('\\').withNullString("\\N").withHeader(header);
            } else if (dataset.type.equals("artificial")) {
                format = format.withEscape('\\').withFirstRecordAsHeader();
            }
            try (CSVParser parser = new CSVParser(new FileReader(dataset.path), format)) {
                for (CSVRecord record : parser) {

                    String lang = record.get("language");
                    String rule = dataset.type.equals("dump") ? record.get("rule_id") : "";
                    String covered = record.get("covered");
                    String replacement = record.get("replacement");
                    String sentence = record.get("sentence");
                    String correction = record.isSet("correction") ? record.get("correction") : "";
                    String acceptLanguage = dataset.type.equals("dump") ? record.get("accept_language") : "";

                    if (sentence == null || sentence.trim().isEmpty()) {
                        continue;
                    }

                    if (!config.language.equals(lang)) {
                        continue; // TODO handle auto maybe?
                    }
                    if (dataset.type.equals("dump") && !config.rule.equals(rule)) {
                        continue;
                    }

                    // correction column missing in export from doccano; workaround
                    if (dataset.enforceCorrect && !record.isSet("correction")) {
                        throw new IllegalStateException("enforceCorrect in dataset configuration enabled,"
                                + " but column 'correction' is not set for entry " + record);
                    }

                    if (dataset.type.equals("dump") && dataset.enforceAcceptLanguage) {
                        if (acceptLanguage != null) {
                            String[] entries = acceptLanguage.split(",", 2);
                            if (entries.length == 2) {
                                String userLanguage = entries[0]; // TODO: what to do with e.g. de-AT,de-DE;...
                                if (!config.language.equals(userLanguage)) {
                                    continue;
                                }
                            }
                        }
                    }

                    // blocks while the bounded task queue is full
                    tasks.put(new SuggestionTestData(lang, sentence, covered, replacement, correction, dataset));
                }
            }

        }

        // NOTE(review): how SuggestionTestThread decides to terminate is not visible here;
        // presumably it exits once the task queue stays empty - confirm.
        for (Thread t : threads) {
            t.join();
        }
        // All results are enqueued now; let the logger drain the queue and finish before the
        // writers are closed by the enclosing try-with-resources.
        workersDone.set(true);
        logger.join();
    }
}

From source file: trainer.userinput.TrainingFileDB.java

/**
 * Builds the CSV format used for training files.
 *
 * <p>Starts from the tab-delimited {@code CSVFormat.TDF} preset, then sets
 * {@code NEW_LINE_SEPARATOR} as the record separator, {@code '^'} as the escape character,
 * and disables quoting ({@code QuoteMode.NONE}).
 *
 * @return the configured format
 */
public static CSVFormat getCSVFormat() {
    // Each with* call returns a new format, so the calls can be chained directly.
    return CSVFormat.TDF
            .withRecordSeparator(NEW_LINE_SEPARATOR)
            .withEscape('^')
            .withQuoteMode(QuoteMode.NONE);
}