List of usage examples for org.apache.commons.csv CSVRecord get
public String get(final String name)
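This overload looks a value up by the column name taken from the CSV header; a companion overload, get(int), reads by column position and appears in several of the examples below. Before those full-application examples, here is a minimal, self-contained sketch of the name-based call. The file name people.csv and the columns name and age are hypothetical, chosen only to illustrate the API.

import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class CsvRecordGetExample {
    public static void main(String[] args) throws Exception {
        // people.csv is assumed to start with a header row such as: name,age
        try (Reader reader = Files.newBufferedReader(Paths.get("people.csv"))) {
            Iterable<CSVRecord> records = CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(reader);
            for (CSVRecord record : records) {
                // get(String) returns the value of the column with that header name;
                // an unknown column name results in an IllegalArgumentException.
                String name = record.get("name");
                String age = record.get("age");
                System.out.println(name + " is " + age);
            }
        }
    }
}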
From source file:de.inren.service.banking.BankDataServiceImpl.java
@Override
public void importTransactionCsv(byte[] bytes) throws IOException {
    Iterable<CSVRecord> records = getIngDibaCsvFormat().parse(createReader(bytes));
    Account account = new Account();
    for (CSVRecord record : records) {
        switch ((int) record.getRecordNumber()) {
        case 1: // Umsatzanzeige (transaction listing)
            break;
        case 2: // Kunde (customer)
            account.setOwner(record.get(1).trim());
            break;
        case 3: // Konto (account)
            String[] vals = record.get(1).split(":");
            account.setName(vals[0].trim());
            account.setNumber(vals[1].trim());
            account = validateAccount(account);
            break;
        case 4: //
            break;
        case 5: // Zeitraum (period)
            break;
        case 6: // Saldo (balance)
            break;
        case 7: // Leer (blank line)
            break;
        case 8: // Überschrift (column headers)
            break;
        default: // Eintrag (transaction entry)
            Transaction transaction = new Transaction();
            transaction.setAccountNumber(account.getNumber().trim());
            transaction.setAccountingDate(getDate(record.get(0)));
            transaction.setValutaDate(getDate(record.get(1)));
            transaction.setPrincipal(record.get(2).trim());
            transaction.setAccountingText(record.get(3).trim());
            transaction.setPurpose(record.get(4).trim());
            transaction.setAmount(getBigDecimal(record.get(5)));
            transaction.setTransactionCurrency(record.get(6).trim());
            transaction.setBalance(getBigDecimal(record.get(7)));
            transaction.setBalanceCurrency(record.get(8).trim());
            transaction.setHashCode(transaction.createHashCode());
            Transaction oldTransaction = transactionRepository.findByHashCode(transaction.getHashCode());
            // only save new transactions
            if (oldTransaction == null) {
                transactionRepository.save(transaction);
            }
        }
    }
    // Add the categories to the new (all) transactions. Should be optimized.
    Iterable<Category> categories = categoryRepository.findAll();
    for (Category category : categories) {
        applyCategoryToTransactions(category);
    }
}
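The importer above addresses columns purely by position and uses getRecordNumber() to skip the export's preamble rows. A minimal sketch of that pattern follows; the file name statements.csv, the semicolon delimiter, the number of preamble rows, and the column positions are assumptions for illustration, not taken from the source above.

import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class PositionalGetExample {
    public static void main(String[] args) throws Exception {
        try (Reader reader = Files.newBufferedReader(Paths.get("statements.csv"))) {
            // The export has no header row usable by commons-csv, so columns are read by index.
            for (CSVRecord record : CSVFormat.DEFAULT.withDelimiter(';').parse(reader)) {
                if (record.getRecordNumber() <= 8) {
                    continue; // skip the preamble rows (title, customer, account, headers, ...)
                }
                String bookingDate = record.get(0); // column 0: booking date
                String amount = record.get(5);      // column 5: amount
                System.out.println(bookingDate + " -> " + amount);
            }
        }
    }
}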
From source file:br.edimarmanica.trinity.intrasitemapping.auto.MappingController.java
private void reading() {
    /**
     * Reading the first Run02.NR_SHARED_PAGES elements of each offset
     */
    File dir = new File(Paths.PATH_TRINITY + site.getPath() + "/offset");
    for (int nrOffset = 0; nrOffset < dir.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(".csv");
        }
    }).length; nrOffset++) {
        List<List<String>> offset = new ArrayList<>(); // each file is one offset
        try (Reader in = new FileReader(dir.getAbsoluteFile() + "/result_" + nrOffset + ".csv")) {
            try (CSVParser parser = new CSVParser(in, CSVFormat.EXCEL)) {
                int nrRegistro = 0;
                for (CSVRecord record : parser) {
                    if (nrRegistro >= Extract.NR_SHARED_PAGES) {
                        break;
                    }
                    for (int nrRegra = 0; nrRegra < record.size(); nrRegra++) {
                        if (nrRegistro == 0) {
                            List<String> regra = new ArrayList<>();
                            try {
                                regra.add(Preprocessing.filter(record.get(nrRegra)));
                            } catch (InvalidValue ex) {
                                regra.add("");
                            }
                            offset.add(regra);
                        } else {
                            try {
                                offset.get(nrRegra).add(Preprocessing.filter(record.get(nrRegra)));
                            } catch (InvalidValue ex) {
                                offset.get(nrRegra).add("");
                            }
                        }
                    }
                    nrRegistro++;
                }
            }
            offsets.add(offset);
        } catch (FileNotFoundException ex) {
            Logger.getLogger(MappingController.class.getName()).log(Level.SEVERE, null, ex);
        } catch (IOException ex) {
            Logger.getLogger(MappingController.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    /**
     * Printing what was read
     */
    /*for (int i = 1; i < offsets.size(); i++) {
        for (int j = 0; j < 5; j++) {
            System.out.print(offsets.get(i).get(0).get(j) + " - ");
        }
        System.out.println("");
    }*/
}
From source file:fr.univ_tours.li.mdjedaini.ideb.io.CsvLogLoader.java
/**
 *
 * @param arg_sessionFilePath
 * @return
 */
public Session loadSession(String arg_sessionFilePath) {
    Session result = new Session();

    try {
        Reader in = new FileReader(arg_sessionFilePath);
        Iterable<CSVRecord> records = CSVFormat.newFormat(';').withFirstRecordAsHeader().parse(in);

        // each record is a query
        for (CSVRecord record : records) {
            // System.out.println("I am parsing the line: " + record);

            String cubeName = record.get("cube");
            EAB_Cube cube = this.be.getBenchmarkData().getInternalCubeList().get(cubeName);
            QueryTriplet q_tmp = new QueryTriplet(cube);

            // extract measures
            String currentMeasure = record.get("Measures");
            // only add measure if not empty
            if (!currentMeasure.equals("[]")) {
                Pattern p = Pattern.compile("([a-zA-Z_0-9][a-zA-Z_0-9 ]+)");
                Matcher m = p.matcher(currentMeasure);

                // manage multiple measures
                while (m.find()) {
                    //System.out.println("Current measure: " + currentMeasure + " --- found: " + m.groupCount());
                    String measure = m.group(1);
                    // System.out.println("Measure: " + measure);

                    // add the current measure to the current query
                    MeasureFragment mf = new MeasureFragment(q_tmp, measure);
                    if (null == mf.getMeasure()) {
                        int i = 2;
                    }
                    q_tmp.addMeasure(mf);
                }
            }

            // extract GBS
            String currentProjection = record.get("GroupBy");
            // only add projections if not empty
            if (!currentProjection.equals("[]")) {
                Pattern p = Pattern.compile("([a-zA-Z_0-9][a-zA-Z_0-9 ]+)");
                Matcher m = p.matcher(currentProjection);

                // manage multiple group by
                while (m.find()) {
                    //System.out.println("Group " + i + ": " + m.group(i));
                    String level = m.group(1);
                    // System.out.println("Level: " + level);

                    EAB_Level l_tmp = cube.getLevelByAtomicName(level);
                    ProjectionFragment pf_tmp = new ProjectionFragment(q_tmp, l_tmp);
                    if (null == pf_tmp.getLevel()) {
                        int i = 2;
                    }
                    q_tmp.addProjection(pf_tmp);
                }
            }

            // extract filters
            String currentSelection = record.get("Filters");
            // only add projections if not empty
            if (!currentSelection.equals("[]")) {
                Pattern p = Pattern.compile("([a-zA-Z_0-9][a-zA-Z_0-9 ]+)=>\\[EQUAL ([a-zA-Z_0-9& ]+)\\]");
                Matcher m = p.matcher(currentSelection);

                // manage multiple occurrences
                while (m.find()) {
                    // System.out.println("Current selection: " + currentSelection + " --- found: " + m.groupCount());
                    String level = m.group(1);
                    String member = m.group(2);

                    EAB_Level l_tmp = cube.getLevelByAtomicName(level);

                    // System.out.println("Cube: " + cubeName);
                    // System.out.println("Level: " + level);
                    // System.out.println("Member: " + member);

                    if (null == l_tmp) {
                        int i = 2;
                    }

                    String dimName = l_tmp.getHierarchy().getDimension().getMondrianDimension().getName();
                    String hieName = l_tmp.getHierarchy().getName();
                    //hieName.spl

                    SelectionFragment sf_tmp = new SelectionFragment(q_tmp, dimName, hieName, level, member);
                    if (null != sf_tmp.getMemberValue()) {
                        q_tmp.addSelection(sf_tmp);
                    }
                }
            }

            // add the query to the session
            result.addQuery(q_tmp);

            QueryConverter qc = new QueryConverter(this.be);
            try {
                System.out.println("******************");
                System.out.println("Record:" + record);
                QueryMdx q_mdx = qc.toMdx(q_tmp);
                System.out.println("MDX with my converter:");
                System.out.println(q_mdx);
                q_mdx.execute(Boolean.TRUE);
                // System.out.println("-----");
                // System.out.println("Query: " + q_tmp);
                // System.out.println("-----");
                // System.out.println("Mdx: " + qc.toMdx(q_tmp));
                // System.out.println("******************");
            } catch (Exception arg_e) {
                System.out.println("******************");
                System.out.println("Exception: " + arg_e.getClass().getName());
                System.out.println("Record:" + record);
                // System.out.println("-----");
                // System.out.println("Query: " + q_tmp);
                // System.out.println("-----");
                //qc.toMdx(q_tmp);
                //System.out.println("******************");
                //System.err.println("Exception with: ");
                //System.err.println("Record: " + record);
            }
        } // end foreach record
    } catch (Exception arg_e) {
        arg_e.printStackTrace();
    }

    // SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
    //
    // // add the name as metadata of the session
    // result.addMetaData("name", arg_sessionFilePath);
    //
    // System.out.println("I am parsing the file: " + arg_sessionFilePath);
    //
    // // pattern for extracting cube name
    // Pattern p = Pattern.compile("from \\[(.*?)\\].*");
    //
    // File file = new File(arg_sessionFilePath);
    //
    // try {
    // //BufferedReader br = new BufferedReader(new FileReader(file));
    // BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(arg_sessionFilePath), "UTF-8"));
    // String line = null;
    //
    // String currentQuery = "";
    //
    // // to parse a query, look for "select"
    // // then take every following line until a blank line is reached...
    // while ((line = br.readLine()) != null) {
    //
    // if (line.contains("select")) {
    //
    // // look for the time before query execution
    // String date = line.substring(0, 23);
    // Date d = sdf.parse(date);
    // Long tsBefore = d.getTime();
    //
    // // get the position of the word "select" in the line
    // Integer position = line.indexOf("select");
    // currentQuery = line.substring(position, line.length());
    //
    // String line_tmp = br.readLine();
    // while (!line_tmp.equals("")) {
    // currentQuery += System.lineSeparator();
    // //currentQuery += System.lineSeparator();
    // currentQuery += line_tmp;
    // line_tmp = br.readLine();
    // }
    //
    // // extract cubename from the query text
    // // Normally, the pattern is always found!
    // Matcher m = p.matcher(currentQuery);
    // m.find();
    // String cubeName = m.group(1);
    //
    // //System.out.println(currentQuery);
    // //System.out.println("cubeName: " + cubeName);
    // //System.out.println("-------");
    //
    // // look for the execution time
    // while (!line_tmp.contains("exec:")) {
    // line_tmp = br.readLine();
    // }
    //
    // // here the line contains exec
    // // look for the time before query execution
    // date = line_tmp.substring(0, 23);
    // d = sdf.parse(date);
    // Long tsAfter = d.getTime();
    //
    // Query q_tmp = new QueryMdx(this.be.getInternalCubeByName(cubeName), currentQuery);
    //
    // result.addQuery(q_tmp, tsBefore, tsAfter);
    // }
    //
    // }
    //
    // br.close();
    // } catch (Exception arg_e) {
    // arg_e.printStackTrace();
    // }

    return result;
}
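The loader above reads named columns (cube, Measures, GroupBy, Filters) from a semicolon-separated log whose first record is the header. A stripped-down sketch of that access pattern follows, assuming a hypothetical session.log.csv with a compatible header; isSet(String) is used only as a defensive guard against rows shorter than the header.

import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class NamedColumnExample {
    public static void main(String[] args) throws Exception {
        // Semicolon-separated log with a header row, e.g. "cube;Measures;GroupBy;Filters"
        try (Reader in = Files.newBufferedReader(Paths.get("session.log.csv"))) {
            Iterable<CSVRecord> records =
                    CSVFormat.DEFAULT.withDelimiter(';').withFirstRecordAsHeader().parse(in);
            for (CSVRecord record : records) {
                // get(String) resolves the value through the header map
                String cube = record.get("cube");
                // isSet(name) checks that the column is mapped and the row actually has a value for it
                String measures = record.isSet("Measures") ? record.get("Measures") : "[]";
                System.out.println(cube + " -> " + measures);
            }
        }
    }
}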
From source file:com.hurence.logisland.service.cache.CSVKeyValueCacheService.java
@Override
// @OnEnabled
public void init(ControllerServiceInitializationContext context) throws InitializationException {
    super.init(context);
    try {
        if (context.getPropertyValue(DATABASE_FILE_URI).isSet()) {
            dbUri = context.getPropertyValue(DATABASE_FILE_URI).asString();
        }

        if (context.getPropertyValue(DATABASE_FILE_PATH).isSet()) {
            dbPath = context.getPropertyValue(DATABASE_FILE_PATH).asString();
        }

        if ((dbUri == null) && (dbPath == null)) {
            throw new Exception(
                    "You must declare " + DATABASE_FILE_URI.getName() + " or " + DATABASE_FILE_PATH.getName());
        }

        InputStream is = null;
        if (dbUri != null) {
            logger.info("opening csv database from hdfs : " + dbUri);
            is = initFromUri(dbUri);
        }

        if (dbPath != null) {
            logger.info("opening csv database from local fs : " + dbPath);
            is = initFromPath(context, dbPath);
        }

        if (is == null) {
            throw new InitializationException("Something went wrong while initializing csv db from "
                    + DATABASE_FILE_URI.getName() + " or " + DATABASE_FILE_PATH.getName());
        }

        // final Reader reader = new InputStreamReader(is);
        CSVFormat format = CSVFormat.DEFAULT;
        if (context.getPropertyValue(CSV_FORMAT).asString().equals(CSV_EXCEL.getValue())) {
            format = CSVFormat.EXCEL;
        } else if (context.getPropertyValue(CSV_FORMAT).asString().equals(CSV_EXCEL_FR.getValue())) {
            format = CSVFormat.EXCEL.withDelimiter(';');
        } else if (context.getPropertyValue(CSV_FORMAT).asString().equals(CSV_MYSQL.getValue())) {
            format = CSVFormat.MYSQL;
        } else if (context.getPropertyValue(CSV_FORMAT).asString().equals(CSV_RFC4180.getValue())) {
            format = CSVFormat.RFC4180;
        } else if (context.getPropertyValue(CSV_FORMAT).asString().equals(CSV_TDF.getValue())) {
            format = CSVFormat.TDF;
        }

        if (context.getPropertyValue(CSV_HEADER).isSet()) {
            String[] columnNames = context.getPropertyValue(CSV_HEADER).asString().split(",");
            for (String name : columnNames) {
                headers.get().put(name, "string");
            }
            format = format.withHeader(columnNames);
        } else if (context.getPropertyValue(FIRST_LINE_HEADER).isSet()) {
            format = format.withFirstRecordAsHeader();
        } else {
            throw new InitializationException("unable to get headers from somewhere");
        }

        Charset charset = Charset.forName("UTF-8");
        if (context.getPropertyValue(ENCODING_CHARSET).isSet()) {
            String encoding = context.getPropertyValue(ENCODING_CHARSET).asString();
            charset = Charset.forName(encoding);
        }

        rowKey = context.getPropertyValue(ROW_KEY).asString();
        CSVParser parser = CSVParser.parse(is, charset, format); // new CSVParser(reader, format);

        /*
         * CSVParser parser = null;
         * if (context.getPropertyValue(ENCODING_CHARSET).isSet()) {
         *     String encoding = context.getPropertyValue(ENCODING_CHARSET).asString();
         *     parser = CSVParser.parse(reader, Charset.forName(encoding), format);
         * } else {
         *     parser = CSVParser.parse(reader, format);
         * }
         */
        long count = 0;
        try {
            final Set<String> columnNames = parser.getHeaderMap().keySet();
            for (final CSVRecord record : parser) {
                Record logislandRecord = new StandardRecord();
                for (final String column : columnNames) {
                    logislandRecord.setStringField(column, record.get(column));
                }

                set(logislandRecord.getField(rowKey).asString(), logislandRecord);
                count++;
            }
        } finally {
            logger.info("successfully loaded " + count + " records from CSV file");
            parser.close();
            is.close();
        }

    } catch (Exception e) {
        getLogger().error("Could not load database file: {}", new Object[] { e.getMessage() });
        throw new InitializationException(e);
    }
}
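The cache service above declares the header explicitly (or takes it from the first record) and then copies every column of each record by name, driven by parser.getHeaderMap(). A minimal sketch of that header-map-driven copy follows, assuming a hypothetical lookup.csv and the column names id, host, and country.

import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class HeaderMapExample {
    public static void main(String[] args) throws Exception {
        // An explicit header is declared, so the file itself does not need a header row.
        CSVFormat format = CSVFormat.DEFAULT.withHeader("id", "host", "country");
        try (Reader reader = Files.newBufferedReader(Paths.get("lookup.csv"));
             CSVParser parser = new CSVParser(reader, format)) {
            for (CSVRecord record : parser) {
                Map<String, String> row = new LinkedHashMap<>();
                // getHeaderMap() exposes the declared columns; get(column) reads each value by name.
                for (String column : parser.getHeaderMap().keySet()) {
                    row.put(column, record.get(column));
                }
                System.out.println(row.get("id") + " -> " + row);
            }
        }
    }
}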
From source file:com.thinkbiganalytics.discovery.parsers.csv.CSVFileSchemaParser.java
private DefaultFileSchema populateSchema(CSVParser parser) {
    DefaultFileSchema fileSchema = new DefaultFileSchema();
    int i = 0;
    ArrayList<Field> fields = new ArrayList<>();
    for (CSVRecord record : parser) {
        if (i > 9) {
            break;
        }
        int size = record.size();
        for (int j = 0; j < size; j++) {
            DefaultField field = null;
            if (i == 0) {
                field = new DefaultField();
                if (headerRow) {
                    field.setName(record.get(j));
                } else {
                    field.setName("Col_" + (j + 1));
                }
                fields.add(field);
            } else {
                try {
                    field = (DefaultField) fields.get(j);
                    field.getSampleValues().add(StringUtils.defaultString(record.get(j), ""));
                } catch (IndexOutOfBoundsException e) {
                    LOG.warn("Sample file has potential sparse column problem at row [?] field [?]", i + 1, j + 1);
                }
            }
        }
        i++;
    }
    fileSchema.setFields(fields);
    return fileSchema;
}
From source file:geovista.readers.csv.GeogCSVReader.java
public Object[] readFileStreaming(InputStream is, ArrayList<Integer> columns) {
    BufferedReader in = new BufferedReader(new InputStreamReader(is));
    Iterable<CSVRecord> parser = null;
    try {
        parser = CSVFormat.DEFAULT.withDelimiter(this.currDelimiter).parse(in);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    // Collect every record so the typed parsing below has data to work with.
    ArrayList<String[]> rows = new ArrayList<String[]>();
    int count = 0;
    for (CSVRecord rec : parser) {
        // eDays.add(rec.get(0));
        // type.add(rec.get(10) + " - " + rec.get(8));
        System.out.println(rec.get(0));
        System.out.println(rec.toString());
        String[] row = new String[rec.size()];
        for (int col = 0; col < rec.size(); col++) {
            row[col] = rec.get(col);
        }
        rows.add(row);
        count++;
    }
    // CSVParser shredder = new CSVParser()
    // CSVParser shredder = new CSVParser(is);
    // shredder.setCommentStart("#;!");
    // shredder.setEscapes("nrtf", "\n\r\t\f");
    String[] headers = null;
    String[] types = null;
    int[] dataTypes = null;
    // The original code left fileContent unset ("fileContent = shredder.getAllValues()" was
    // commented out), which would fail below; the rows collected above are used instead.
    String[][] fileContent = rows.toArray(new String[rows.size()][]);
    int dataBegin;
    Object[] data;
    types = fileContent[0]; // first line tells us types
    dataTypes = new int[types.length];
    int len;
    if (types[0].equalsIgnoreCase("int") || types[0].equalsIgnoreCase("double")
            || types[0].equalsIgnoreCase("string")) {
        dataBegin = 2;
        headers = fileContent[1];
        data = new Object[headers.length + 1]; // plus one for the headers themselves
        len = fileContent.length - dataBegin;
        for (int i = 0; i < headers.length; i++) {
            if (types[i].equalsIgnoreCase("int")) {
                data[i + 1] = new int[len];
                dataTypes[i] = GeogCSVReader.DATA_TYPE_INT;
            } else if (types[i].equalsIgnoreCase("double")) {
                data[i + 1] = new double[len];
                dataTypes[i] = GeogCSVReader.DATA_TYPE_DOUBLE;
            } else if (types[i].equalsIgnoreCase("string")) {
                data[i + 1] = new String[len];
                dataTypes[i] = GeogCSVReader.DATA_TYPE_STRING;
            } else {
                throw new IllegalArgumentException("GeogCSVReader.readFile, unknown type = " + types[i]);
            }
        }
    } else {
        dataBegin = 1;
        headers = fileContent[0];
        data = new Object[headers.length + 1]; // plus one for the headers themselves
        len = fileContent.length - dataBegin;
        for (int i = 0; i < headers.length; i++) {
            String firstString = fileContent[1][i];
            String secondString = fileContent[2][i];
            String thirdString = fileContent[3][i];
            // the original indexed fileContent[fileContent[0].length][i], which can overrun;
            // the last row is what is actually meant
            String lastString = fileContent[fileContent.length - 1][i];
            if (isNumeric(firstString) && isNumeric(secondString) && isNumeric(thirdString)
                    && isNumeric(lastString)) {
                if (isInt(fileContent, i) == false) {
                    // if (isDouble(firstString) || isDouble(secondString)
                    // || isDouble(thirdString) || isDouble(lastString)) {
                    data[i + 1] = new double[len];
                    dataTypes[i] = GeogCSVReader.DATA_TYPE_DOUBLE;
                } else {
                    data[i + 1] = new int[len];
                    dataTypes[i] = GeogCSVReader.DATA_TYPE_INT;
                }
            } else {
                data[i + 1] = new String[len];
                dataTypes[i] = GeogCSVReader.DATA_TYPE_STRING;
            }
        }
    }
    data[0] = headers;
    String[] line = null;
    for (int row = dataBegin; row < len + dataBegin; row++) {
        line = fileContent[row];
        int[] ints = null;
        double[] doubles = null;
        String[] strings = null;
        for (int column = 0; column < line.length; column++) {
            String item = line[column];
            if (dataTypes[column] == GeogCSVReader.DATA_TYPE_INT) {
                if (Arrays.binarySearch(GeogCSVReader.NULL_STRINGS, item) >= 0) {
                    ints = (int[]) data[column + 1];
                    ints[row - dataBegin] = GeogCSVReader.NULL_INT;
                } else {
                    ints = (int[]) data[column + 1];
                    try {
                        ints[row - dataBegin] = Integer.parseInt(item);
                    } catch (NumberFormatException nfe) {
                        logger.warning("could not parse " + item + " in column " + column);
                        // nfe.printStackTrace();
                        ints[row - dataBegin] = GeogCSVReader.NULL_INT;
                    }
                }
            } else if (dataTypes[column] == GeogCSVReader.DATA_TYPE_DOUBLE) {
                if (Arrays.binarySearch(GeogCSVReader.NULL_STRINGS, item) >= 0) {
                    doubles = (double[]) data[column + 1];
                    doubles[row - dataBegin] = GeogCSVReader.NULL_DOUBLE;
                } else {
                    doubles = (double[]) data[column + 1];
                    doubles[row - dataBegin] = parseDouble(item);
                }
            } else if (dataTypes[column] == GeogCSVReader.DATA_TYPE_STRING) {
                strings = (String[]) data[column + 1];
                strings[row - dataBegin] = item;
            } else {
                throw new IllegalArgumentException(
                        "GeogCSVReader.readFile, unknown type = " + dataTypes[column]);
            } // end if
        } // next column
    } // next row
    return data;
}
From source file:com.raceup.fsae.test.TesterGui.java
/**
 * Parses data file, builds a Test
 *
 * @param pathToDataFile path to data csv file
 */
private void parseDataFileAndCreateTestOrFail(String pathToDataFile) {
    ArrayList<Question> questions = new ArrayList<>();
    CSVRecord[] rows = null;

    try {
        CSVParser parser = CSVFormat.DEFAULT.parse(new FileReader(pathToDataFile));
        // read all records once; a second getRecords() call would see an already-consumed parser
        List<CSVRecord> records = parser.getRecords();
        rows = records.toArray(new CSVRecord[records.size()]);
    } catch (Exception e) {
        System.err.println(e.toString());
    }

    for (CSVRecord row : rows) { // each row represents a question
        ArrayList<Answer> answers = new ArrayList<>(); // list of answers
        if (row.size() > 1) {
            for (int i = 1; i < row.size(); i++) {
                if (row.get(i).length() > 0) {
                    answers.add(new Answer(row.get(i)));
                }
            }
            Answer correctAnswer = answers.get(0); // the correct answer is always the first one
            String questionText = row.get(0);
            questions.add(new Question(questionText, answers.toArray(new Answer[answers.size()]),
                    correctAnswer)); // add to list of questions
        }
    }
    test = new Test(questions.toArray(new Question[questions.size()]));
}
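The example above materializes every record up front with getRecords(). A minimal sketch of that whole-file pattern follows, assuming a hypothetical quiz.csv in which column 0 holds the question text and the remaining columns hold answer options.

import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class GetRecordsExample {
    public static void main(String[] args) throws Exception {
        try (Reader reader = Files.newBufferedReader(Paths.get("quiz.csv"));
             CSVParser parser = CSVFormat.DEFAULT.parse(reader)) {
            // getRecords() reads the remaining input into memory; the parser is consumed afterwards.
            List<CSVRecord> rows = parser.getRecords();
            for (CSVRecord row : rows) {
                String question = row.get(0);
                int answerCount = row.size() - 1; // everything after column 0 is an answer option
                System.out.println(question + " (" + answerCount + " answers)");
            }
        }
    }
}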
From source file:com.datascience.hadoop.CsvRecordReader.java
@Override
public boolean next(LongWritable key, ListWritable<Text> value) throws IOException {
    value.clear();
    try {
        if (iterator.hasNext()) {
            CSVRecord record = iterator.next();
            position++;
            colLength = colLength == null ? record.size() : colLength;
            if ((!record.isConsistent() || record.size() != colLength) && strict) {
                String message = String.format("%s: %s", "inconsistent record at position", position);
                throw new CsvParseException(message);
            }

            key.set(record.getRecordNumber());
            for (int i = 0; i < record.size(); i++) {
                String item = record.get(i);
                if (item == null) {
                    value.add(null);
                } else {
                    Text text = cache[i];
                    if (text == null) {
                        text = new Text();
                        cache[i] = text;
                    }
                    text.set(item);
                    value.add(text);
                }
            }
            //position = record.getCharacterPosition();
            return true;
        }
    } catch (Exception e) {
        LOGGER.warn("failed to parse record at position: " + position);
        if (strict) {
            throw e;
        } else {
            return next(key, value);
        }
    }
    return false;
}
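The record reader above walks each record positionally with record.size() and record.get(i). A minimal sketch of that positional iteration follows, assuming a hypothetical rows.csv.

import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class IndexedGetExample {
    public static void main(String[] args) throws Exception {
        try (Reader reader = Files.newBufferedReader(Paths.get("rows.csv"))) {
            for (CSVRecord record : CSVFormat.DEFAULT.parse(reader)) {
                StringBuilder line = new StringBuilder();
                // size() is the number of values in this record; get(int) returns the value at that position
                for (int i = 0; i < record.size(); i++) {
                    if (i > 0) {
                        line.append(" | ");
                    }
                    line.append(record.get(i));
                }
                System.out.println(record.getRecordNumber() + ": " + line);
            }
        }
    }
}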
From source file:ch.eitchnet.csvrestendpoint.marshaller.CsvDataToJsonMarshaller.java
/**
 * Returns true if the given {@link CSVRecord} is to be selected
 *
 * @param headerMap
 *            the map containing the column headers with the column index
 * @param record
 *            the {@link CSVRecord} to check
 *
 * @return true if the record is to be selected, false if not
 */
protected boolean isSelected(Map<String, Integer> headerMap, CSVRecord record) {
    if (StringHelper.isEmpty(this.query))
        return true;

    if (this.queryFields.isEmpty()) {
        // iterate all possible fields and see if the query matches
        for (String value : record) {
            if (value.toLowerCase().contains(this.query))
                return true;
        }
    } else {
        // iterate only the query fields
        for (String queryField : this.queryFields) {
            String value = record.get(queryField);
            if (value.toLowerCase().contains(this.query))
                return true;
        }
    }

    return false;
}
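The marshaller above shows two lookup styles side by side: iterating every value of a record (CSVRecord is Iterable over its values) versus consulting only selected columns through get(String). A small sketch of the same filtering idea, assuming a hypothetical report.csv with city and country columns and a hard-coded query term.

import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class RecordSearchExample {
    public static void main(String[] args) throws Exception {
        String query = "berlin";
        List<String> queryFields = Arrays.asList("city", "country");
        try (Reader reader = Files.newBufferedReader(Paths.get("report.csv"))) {
            for (CSVRecord record : CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(reader)) {
                boolean selected = false;
                if (queryFields.isEmpty()) {
                    // scan every value of the record
                    for (String value : record) {
                        if (value.toLowerCase().contains(query)) {
                            selected = true;
                            break;
                        }
                    }
                } else {
                    // or consult only the named columns via get(String)
                    for (String field : queryFields) {
                        if (record.get(field).toLowerCase().contains(query)) {
                            selected = true;
                            break;
                        }
                    }
                }
                if (selected) {
                    System.out.println(record);
                }
            }
        }
    }
}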
From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.QuestionSetManager.java
/**
 * This function is responsible for parsing a duplicate Stack Exchange thread TSV file produced by
 * {@link StackExchangeThreadSerializer}, and partitioning each such thread into the training set,
 * test set, or validation set. In addition, the corresponding row of the TSV file will be written
 * out to a training-, test-, or validation-set-specific TSV file in the same directory as the
 * input TSV file.
 *
 * @param dupQuestionFile - A TSV file containing duplicate {@link StackExchangeThread} records
 * @param trainTestValidateCumulativeProbs - A CDF of the desired proportion of training, test,
 *        and validation set records
 * @throws PipelineException
 */
private void parseTsvAndPartitionRecords(File dupQuestionFile, double[] trainTestValidateCumulativeProbs)
        throws PipelineException {
    // Open the TSV file for parsing, and CSVPrinters for outputting train, test, and validation set TSV files
    String baseName = FilenameUtils.removeExtension(dupQuestionFile.getAbsolutePath());
    String extension = FilenameUtils.getExtension(dupQuestionFile.getAbsolutePath());
    try (FileReader reader = new FileReader(dupQuestionFile);
            CSVPrinter trainSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_TRAIN_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()));
            CSVPrinter testSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_TEST_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()));
            CSVPrinter validationSetPrinter = new CSVPrinter(
                    new FileWriter(baseName + StackExchangeConstants.DUP_THREAD_TSV_VALIDATE_FILE_SUFFIX
                            + FilenameUtils.EXTENSION_SEPARATOR + extension),
                    CSVFormat.TDF.withHeader(CorpusBuilder.getTsvColumnHeaders()))) {

        // Parse the duplicate thread TSV file
        CSVParser parser = CSVFormat.TDF.withHeader().parse(reader);

        // Iterate over each CSV record, and place into a desired partition (train, test, or validation)
        Iterator<CSVRecord> recordIterator = parser.iterator();
        while (recordIterator.hasNext()) {
            CSVRecord record = recordIterator.next();

            // Get the StackExchangeThread associated with this record, and create a question from it
            StackExchangeThread duplicateThread = StackExchangeThreadSerializer.deserializeThreadFromBinFile(
                    record.get(CorpusBuilder.TSV_COL_HEADER_SERIALIZED_FILE_PATH));
            StackExchangeQuestion duplicateQuestion = new StackExchangeQuestion(duplicateThread);
            String parentId = record.get(CorpusBuilder.TSV_COL_HEADER_PARENT_ID);

            // Now drop this question into a partition, and write it to a corresponding TSV file
            double p = rng.nextDouble(); // Random number determines partition for this record
            if (p <= trainTestValidateCumulativeProbs[0]) {
                // This record goes in the training set
                if (!addQuestionToSet(duplicateQuestion, parentId, this.trainingSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.TRAINING_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                trainSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            } else if (p <= trainTestValidateCumulativeProbs[1]) {
                // This record goes in the test set
                if (!addQuestionToSet(duplicateQuestion, parentId, this.testSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.TEST_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                testSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            } else {
                // This record goes in the validation set
                assert (p <= trainTestValidateCumulativeProbs[2]);
                if (!addQuestionToSet(duplicateQuestion, parentId, this.validationSet)) {
                    throw new PipelineException(
                            MessageFormat.format(Messages.getString("RetrieveAndRank.VALIDATION_SET_FAILED_Q"), //$NON-NLS-1$
                                    duplicateThread.getId()));
                }
                validationSetPrinter.printRecord((Object[]) convertRecordToArray(record));
            }
        }

        // Flush all the printers prior to closing
        trainSetPrinter.flush();
        testSetPrinter.flush();
        validationSetPrinter.flush();
    } catch (IOException | IngestionException e) {
        throw new PipelineException(e);
    }
}
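The partitioner above reads a tab-separated file whose header comes from the first record (CSVFormat.TDF.withHeader()), pulls individual columns by name, and writes selected rows back out with a CSVPrinter. A compact sketch of that read-by-name, write-with-printer round trip follows; the file names threads.tsv and threads.train.tsv and the column names id, parentId, and path are assumptions for illustration.

import java.io.Reader;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;

public class TsvPartitionSketch {
    public static void main(String[] args) throws Exception {
        CSVFormat tsvIn = CSVFormat.TDF.withFirstRecordAsHeader();
        CSVFormat tsvOut = CSVFormat.TDF.withHeader("id", "parentId", "path");
        try (Reader reader = Files.newBufferedReader(Paths.get("threads.tsv"));
             Writer writer = Files.newBufferedWriter(Paths.get("threads.train.tsv"));
             CSVPrinter printer = new CSVPrinter(writer, tsvOut)) {
            for (CSVRecord record : tsvIn.parse(reader)) {
                // get(String) resolves each column through the header read from the first record
                String parentId = record.get("parentId");
                if (!parentId.isEmpty()) {
                    printer.printRecord(record.get("id"), parentId, record.get("path"));
                }
            }
            printer.flush();
        }
    }
}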