List of usage examples for org.apache.commons.csv CSVRecord get
public String get(final String name)
From source file:norbert.mynemo.dataimport.scraping.CkMapping.java
/** * Creates a mapping from a record. The record was usually created from a parser created by the * {@link #createParser(String)} method. *///from w w w .j a v a 2s .co m public static CkMapping createMapping(CSVRecord record) { return new CkMapping(record.get(CK_MOVIE_HEADER), record.get(IMDB_MOVIE_HEADER)); }
From source file:norbert.mynemo.dataimport.scraping.CkRating.java
/** * Creates a rating from a record. The record was usually created from a parser created by the * {@link #createParser(String)} method. */// w w w .ja v a2s . c o m public static CkRating createRating(CSVRecord record) { return new CkRating(record.get(USER_HEADER), record.get(MOVIE_HEADER), record.get(VALUE_HEADER)); }
From source file:notaql.engines.csv.CSVEngineEvaluator.java
/** * Evaluates the given transformation./*w w w .ja v a2 s . co m*/ * * This first parses the document (with the first line being the header) and then evaluates on our framework. * * TODO: this assumes a header line. It might happen that it is not provided. * * @param transformation * @return */ @Override public JavaRDD<ObjectValue> evaluate(Transformation transformation) { final SparkTransformationEvaluator evaluator = new SparkTransformationEvaluator(transformation); final JavaSparkContext sc = NotaQL.SparkFactory.getSparkContext(); final CSVFormat format = CSVFormat.DEFAULT; final JavaRDD<String> csv = sc.textFile(path); final String first = csv.first(); final CSVRecord header; try { header = format.parse(new StringReader(first)).iterator().next(); } catch (IOException e) { e.printStackTrace(); throw new AssertionError("Header could not be read for some reason."); } String[] headerCols = new String[header.size()]; for (int i = 0; i < header.size(); i++) { headerCols[i] = header.get(i); } final CSVFormat headerFormat = CSVFormat.DEFAULT.withHeader(headerCols); final JavaRDD<CSVRecord> records = csv.filter(f -> !f.equals(first)) .map(line -> headerFormat.parse(new StringReader(line)).iterator().next()); final JavaRDD<Value> converted = records.map(ValueConverter::convertToNotaQL); final JavaRDD<Value> filtered = converted.filter(o -> transformation.satisfiesInPredicate((ObjectValue) o)); return evaluator.process(filtered); }
From source file:nz.ac.waikato.cms.supernova.gui.Supernova.java
/** * Generates the output of the "batch" tab. *///from w w w . j av a 2s. c om protected void generateBatchOutput() { String cls; AbstractOutputGenerator generator; int colID; int colMeasure; int colScore; int colPercentile; Reader reader; CSVParser csvparser; String oldID; Map<String, List<Double>> test; String id; File outfile; String msg; String measure; double score; double percentile; String error; m_BatchLog.setText(""); m_BatchGenerate.setEnabled(false); try { cls = AbstractOutputGenerator.class.getPackage().getName() + "." + m_SingleGenerator.getSelectedItem(); generator = (AbstractOutputGenerator) Class.forName(cls).newInstance(); } catch (Exception e) { batchLog("Failed to instantiate output generator - falling back on PNG", e); generator = new PNG(); } try { colID = 0; colMeasure = 1; colScore = 2; colPercentile = 3; reader = new FileReader(m_BatchCSV.getCurrent()); csvparser = new CSVParser(reader, CSVFormat.EXCEL.withHeader()); oldID = ""; test = new HashMap<>(); for (CSVRecord rec : csvparser) { if (rec.size() < 4) continue; id = rec.get(colID); if (!id.equals(oldID)) { if (!test.isEmpty()) { outfile = new File(m_BatchOutput.getCurrent() + File.separator + oldID + "." + generator.getExtension()); batchLog("Generating: " + outfile, false); batchLog("Using: " + test, false); msg = generator.generate(test, outfile); if (msg != null) { error = "Failed to generate output for ID: " + oldID; batchLog(error, true); } } test.clear(); oldID = id; } measure = rec.get(colMeasure); score = Double.parseDouble(rec.get(colScore)); percentile = Double.parseDouble(rec.get(colPercentile)); test.put(measure, new ArrayList<>(Arrays.asList(new Double[] { score, percentile }))); } if (!test.isEmpty()) { outfile = new File( m_BatchOutput.getCurrent() + File.separator + oldID + "." + generator.getExtension()); batchLog("Generating: " + outfile, false); batchLog("Using: " + test, false); msg = generator.generate(test, outfile); if (msg != null) { error = "Failed to generate output for ID: " + oldID; batchLog(error, true); } } } catch (Exception e) { batchLog("Failed to generate output!", e); } m_BatchGenerate.setEnabled(true); }
From source file:nz.ac.waikato.cms.supernova.SupernovaCSV.java
public static void main(String[] args) throws Exception { ArgumentParser parser;/* w w w . j a v a2s .c o m*/ parser = ArgumentParsers.newArgumentParser("I am supernova"); parser.description("Generates output according to 'I am supernova' by Keith Soo.\n" + "Loads scores/percentiles from a CSV file to generate multiple outputs at once.\n" + "Expected four columns (name of column is irrelevant):\n" + "- ID: the filename (excluding path and extension)\n" + "- Measure: the measure (" + MEASURE_LIST + ")\n" + "- Score: the score of the measure\n" + "- Percentile: the percentile of the measure\n" + "\n" + "Project homepage:\n" + "https://github.com/fracpete/i-am-supernova"); // colors parser.addArgument("--" + AbstractOutputGenerator.OPENNESS + COLOR_SUFFIX) .metavar(AbstractOutputGenerator.OPENNESS + COLOR_SUFFIX).type(String.class) .setDefault(ColorHelper.toHex(Color.ORANGE)) .help("The color for '" + AbstractOutputGenerator.OPENNESS + "' in hex format (e.g., " + ColorHelper.toHex(Color.ORANGE) + ")."); parser.addArgument("--" + AbstractOutputGenerator.EXTRAVERSION + COLOR_SUFFIX) .metavar(AbstractOutputGenerator.EXTRAVERSION + COLOR_SUFFIX).type(String.class) .setDefault(ColorHelper.toHex(Color.YELLOW)) .help("The color for '" + AbstractOutputGenerator.EXTRAVERSION + "' in hex format (e.g., " + ColorHelper.toHex(Color.YELLOW) + ")."); parser.addArgument("--" + AbstractOutputGenerator.AGREEABLENESS + COLOR_SUFFIX) .metavar(AbstractOutputGenerator.AGREEABLENESS + COLOR_SUFFIX).type(String.class) .setDefault(ColorHelper.toHex(Color.GREEN)) .help("The color for '" + AbstractOutputGenerator.AGREEABLENESS + "' in hex format (e.g., " + ColorHelper.toHex(Color.GREEN) + ")."); parser.addArgument("--" + AbstractOutputGenerator.CONSCIENTIOUSNESS + COLOR_SUFFIX) .metavar(AbstractOutputGenerator.CONSCIENTIOUSNESS + COLOR_SUFFIX).type(String.class) .setDefault(ColorHelper.toHex(Color.BLUE)) .help("The color for '" + AbstractOutputGenerator.CONSCIENTIOUSNESS + "' in hex format (e.g., " + ColorHelper.toHex(Color.BLUE) + ")."); parser.addArgument("--" + AbstractOutputGenerator.NEUROTICISM + COLOR_SUFFIX) .metavar(AbstractOutputGenerator.NEUROTICISM + COLOR_SUFFIX).type(String.class) .setDefault(ColorHelper.toHex(Color.RED)) .help("The color for '" + AbstractOutputGenerator.NEUROTICISM + "' in hex format (e.g., " + ColorHelper.toHex(Color.RED) + ")."); // other parameters parser.addArgument("--" + CSV).metavar(CSV).type(String.class).required(true) .help("The CSV file containing the scores/percentiles (header must be present)."); parser.addArgument("--" + ID).metavar(ID).type(Integer.class).setDefault(1) .help("The 1-based index of the column in the CSV file containing the ID for the output file."); parser.addArgument("--" + MEASURE).metavar(MEASURE).type(Integer.class).setDefault(2) .help("The 1-based index of the column in the CSV file containing the measure name.\n" + "Allowed values: " + MEASURE_LIST); parser.addArgument("--" + SCORE).metavar(SCORE).type(Integer.class).setDefault(3) .help("The 1-based index of the column in the CSV file containing the scores."); parser.addArgument("--" + PERCENTILE).metavar(PERCENTILE).type(Integer.class).setDefault(4) .help("The 1-based index of the column in the CSV file containing the percentiles."); parser.addArgument("--" + BACKGROUND).metavar(BACKGROUND).type(String.class) .setDefault(ColorHelper.toHex(Color.BLACK)).help("The background color."); parser.addArgument("--" + OPACITY).metavar(OPACITY).type(Double.class).setDefault(0.1) .help("The opacity (0-1)."); parser.addArgument("--" + MARGIN).metavar(MARGIN).type(Double.class).setDefault(0.2) .help("The margin in the output (0-1)."); parser.addArgument("--" + WIDTH).metavar(WIDTH).type(Integer.class).setDefault(2000) .help("The width of the output."); parser.addArgument("--" + HEIGHT).metavar(HEIGHT).type(Integer.class).setDefault(2000) .help("The height of the output."); parser.addArgument("--" + CENTER).metavar(CENTER).type(String.class).setDefault(Incenter.class.getName()) .help("The name of the algorithm for calculating the center of a triangle.\n" + "Available: " + Registry.toString(Registry.getCenters(), true)); parser.addArgument("--" + GENERATOR).metavar(GENERATOR).type(String.class).setDefault(PNG.class.getName()) .help("The name of the generator class to use.\n" + "Available: " + Registry.toString(Registry.getGenerators(), true)); parser.addArgument("--" + OUTPUT).metavar(OUTPUT).type(String.class) .help("The directory to store the output in."); parser.addArgument("--" + VERBOSE).metavar(VERBOSE).type(Boolean.class).action(Arguments.storeTrue()) .help("Whether to output logging information."); Namespace namespace; try { namespace = parser.parseArgs(args); } catch (Exception e) { if (!(e instanceof HelpScreenException)) parser.printHelp(); return; } // colors Map<String, Color> colors = new HashMap<>(); colors.put(AbstractOutputGenerator.OPENNESS, ColorHelper .valueOf(namespace.getString(AbstractOutputGenerator.OPENNESS + COLOR_SUFFIX), Color.ORANGE)); colors.put(AbstractOutputGenerator.EXTRAVERSION, ColorHelper .valueOf(namespace.getString(AbstractOutputGenerator.EXTRAVERSION + COLOR_SUFFIX), Color.YELLOW)); colors.put(AbstractOutputGenerator.AGREEABLENESS, ColorHelper .valueOf(namespace.getString(AbstractOutputGenerator.AGREEABLENESS + COLOR_SUFFIX), Color.GREEN)); colors.put(AbstractOutputGenerator.CONSCIENTIOUSNESS, ColorHelper.valueOf( namespace.getString(AbstractOutputGenerator.CONSCIENTIOUSNESS + COLOR_SUFFIX), Color.BLUE)); colors.put(AbstractOutputGenerator.NEUROTICISM, ColorHelper .valueOf(namespace.getString(AbstractOutputGenerator.NEUROTICISM + COLOR_SUFFIX), Color.RED)); File outdir = new File(namespace.getString(OUTPUT)); String centerCls = namespace.getString(CENTER); if (!centerCls.contains(".")) centerCls = AbstractTriangleCenterCalculation.class.getPackage().getName() + "." + centerCls; String generatorCls = namespace.getString(GENERATOR); if (!generatorCls.contains(".")) generatorCls = AbstractOutputGenerator.class.getPackage().getName() + "." + generatorCls; AbstractOutputGenerator generator = (AbstractOutputGenerator) Class.forName(generatorCls).newInstance(); generator.setVerbose(namespace.getBoolean(VERBOSE)); generator.setColors(colors); generator.setBackground(ColorHelper.valueOf(namespace.getString(BACKGROUND), Color.BLACK)); generator.setOpacity(namespace.getDouble(OPACITY)); generator.setMargin(namespace.getDouble(MARGIN)); generator.setCenter((AbstractTriangleCenterCalculation) Class.forName(centerCls).newInstance()); if (generator instanceof AbstractOutputGeneratorWithDimensions) { AbstractOutputGeneratorWithDimensions pixel = (AbstractOutputGeneratorWithDimensions) generator; pixel.setWidth(namespace.getInt(WIDTH)); pixel.setHeight(namespace.getInt(HEIGHT)); } int colID = namespace.getInt(ID) - 1; int colMeasure = namespace.getInt(MEASURE) - 1; int colScore = namespace.getInt(SCORE) - 1; int colPercentile = namespace.getInt(PERCENTILE) - 1; Reader reader = new FileReader(namespace.getString(CSV)); CSVParser csvparser = new CSVParser(reader, CSVFormat.EXCEL.withHeader()); String oldID = ""; Map<String, List<Double>> test = new HashMap<>(); for (CSVRecord rec : csvparser) { if (rec.size() < 4) continue; String id = rec.get(colID); if (!id.equals(oldID)) { if (!test.isEmpty()) { File outfile = new File(outdir + File.separator + oldID + "." + generator.getExtension()); String msg = generator.generate(test, outfile); if (msg != null) System.err.println("Failed to generate output for ID: " + oldID); } test.clear(); oldID = id; } String measure = rec.get(colMeasure); double score = Double.parseDouble(rec.get(colScore)); double percentile = Double.parseDouble(rec.get(colPercentile)); test.put(measure, new ArrayList<>(Arrays.asList(new Double[] { score, percentile }))); } if (!test.isEmpty()) { File outfile = new File(outdir + File.separator + oldID + "." + generator.getExtension()); String msg = generator.generate(test, outfile); if (msg != null) System.err.println("Failed to generate output for ID: " + oldID); } }
From source file:nzilbb.agcsv.AgCsvDeserializer.java
/** * Loads the serialized form of the graph, using the given set of named streams. * @param streams A list of named streams that contain all the transcription/annotation data required. * @param schema The layer schema, definining layers and the way they interrelate. * @return A list of parameters that require setting before {@link IDeserializer#deserialize()} can be invoked. This may be an empty list, and may include parameters with the value already set to a workable default. If there are parameters, and user interaction is possible, then the user may be presented with an interface for setting/confirming these parameters, before they are then passed to {@link IDeserializer#setParameters(ParameterSet)}. * @throws SerializationException If the graph could not be loaded. * @throws IOException On IO error./* w ww .j a v a 2 s . co m*/ * @throws SerializerNotConfiguredException If the configuration is not sufficient for deserialization. */ @SuppressWarnings({ "rawtypes", "unchecked" }) public ParameterSet load(NamedStream[] streams, Schema schema) throws IOException, SerializationException, SerializerNotConfiguredException { if (getFieldDelimiter() == null) throw new SerializerNotConfiguredException("fieldDelimiter must be set."); ParameterSet parameters = new ParameterSet(); // take the first csv stream, ignore all others. NamedStream csv = Utility.FindSingleStream(streams, ".csv", "text/csv"); if (csv == null) throw new SerializationException("No CSV stream found"); setName(csv.getName()); setName(getName().replaceFirst("\\.csv$", "").replaceFirst("\\.ag$", "")); reset(); CSVParser parser = new CSVParser(new InputStreamReader(csv.getStream()), CSVFormat.EXCEL.withDelimiter(fieldDelimiter.charAt(0))); mDiscoveredLayers = new HashMap<String, Layer>(); Vector<CSVRecord> vRecords = new Vector<CSVRecord>(); mCsvData.put("anchor", vRecords); // start with anchors // read all the lines, and extract the layer names for (CSVRecord line : parser) { // does it have only one field? - the layer name if (line.get(0).equals("layer")) { Layer layer = new Layer(line.get(1), line.get(2), Integer.parseInt(line.get(5)), true, // peers false, // peersOverlap false, // saturated line.get(4).equals("W") ? schema.getWordLayerId() // parentId : line.get(4).equals("M") ? schema.getTurnLayerId() // parentId : line.get(4).equals("F") ? "graph" : "segments", // parentId true); // parentIncludes int layerId = Integer.parseInt(line.get(6)); if (layerId == 11) // turn { layer.setParentId(schema.getParticipantLayerId()); } else if (layerId == 12) // utterance { layer.setSaturated(true); } else if (layerId == 0) // transcription { layer.setParentId(schema.getTurnLayerId()); } else if (layerId == 2) // orthography { layer.setPeers(false); layer.setSaturated(true); } else if (layerId == 1) // segments { layer.setSaturated(true); } layer.put("@layer_id", layerId); layer.put("@type", line.get(3)); layer.put("@scope", line.get(4)); mDiscoveredLayers.put(line.get(1), layer); Parameter p = new Parameter(layer.getId(), Layer.class, layer.getId(), layer.getDescription(), true); p.setValue(schema.getLayer(layer.getId())); p.setPossibleValues(schema.getLayers().values()); parameters.addParameter(p); // start a new set of records vRecords = new Vector<CSVRecord>(); mCsvData.put(layer.getId(), vRecords); } vRecords.add(line); } // next line parser.close(); return parameters; }
From source file:nzilbb.agcsv.AgCsvDeserializer.java
/** * Deserializes the serialized data, generating one or more {@link Graph}s. * @return A list of valid (if incomplete) {@link Graph}s. * @throws SerializerNotConfiguredException if the object has not been configured. * @throws SerializationParametersMissingException if the parameters for this particular graph have not been set. * @throws SerializationException if errors occur during deserialization. *///from www .j a va 2s. com public Graph[] deserialize() throws SerializerNotConfiguredException, SerializationParametersMissingException, SerializationException { // if there are errors, accumlate as many as we can before throwing SerializationException SerializationException errors = null; Graph graph = new Graph(); graph.setId(getName()); // add layers to the graph // we don't just copy the whole schema, because that would imply that all the extra layers // contained no annotations, which is not necessarily true graph.addLayer((Layer) s.getParticipantLayer().clone()); graph.getSchema().setParticipantLayerId(s.getParticipantLayer().getId()); graph.addLayer((Layer) s.getTurnLayer().clone()); graph.getSchema().setTurnLayerId(s.getTurnLayer().getId()); graph.addLayer((Layer) s.getUtteranceLayer().clone()); graph.getSchema().setUtteranceLayerId(s.getUtteranceLayer().getId()); graph.addLayer((Layer) s.getWordLayer().clone()); graph.getSchema().setWordLayerId(s.getWordLayer().getId()); for (String layerId : mDiscoveredLayers.keySet()) { if (mDiscoveredLayers.get(layerId) != null) { graph.addLayer((Layer) mDiscoveredLayers.get(layerId).clone()); } } // next layer // anchors for (CSVRecord line : mCsvData.get("anchor")) { if (line.get(1).equals("offset")) continue; // skip header line Anchor anchor = new Anchor(line.get(0), new Double(line.get(1)), new Integer(line.get(2))); graph.addAnchor(anchor); if (line.size() > 3) { String comment = line.get(3); if (comment.length() > 0) { anchor.put("comment", comment); } } } // next anchor mCsvData.remove("anchor"); // layers for (String originalId : mCsvData.keySet()) { if (mDiscoveredLayers.get(originalId) != null) { // mapped to a schema layer try { readAnnotations(mCsvData.get(originalId), mDiscoveredLayers.get(originalId), graph); } catch (SerializationException exception) { if (errors == null) { errors = exception; } else { errors.addError(SerializationException.ErrorType.Other, exception.getMessage()); } } } // mapped to a schema layer } // next layer if (errors != null) throw errors; Graph[] graphs = { graph }; return graphs; }
From source file:nzilbb.agcsv.AgCsvDeserializer.java
/** * Create annotations from the given CSV rows. * @param lines CSV records./* w w w . ja v a 2s .c o m*/ * @param layer Layer for the annotations. * @param graph Graph to add the annotations to. * @throws SerializationException On error. */ public void readAnnotations(Vector<CSVRecord> lines, Layer layer, Graph graph) throws SerializationException { // map header columns HashMap<String, Integer> mHeadings = new HashMap<String, Integer>(); for (int c = 0; c < lines.elementAt(1).size(); c++) { String sHeader = lines.elementAt(1).get(c); if (sHeader.equalsIgnoreCase("id")) mHeadings.put("id", c); else if (sHeader.equalsIgnoreCase("startAnchor.id")) mHeadings.put("startAnchor.id", c); else if (sHeader.equalsIgnoreCase("endAnchor.id")) mHeadings.put("endAnchor.id", c); else if (sHeader.equalsIgnoreCase("label")) mHeadings.put("label", c); else if (sHeader.equalsIgnoreCase("labelStatus")) mHeadings.put("labelStatus", c); else if (sHeader.equalsIgnoreCase("turnAnnotationId")) mHeadings.put("turnAnnotationId", c); else if (sHeader.equalsIgnoreCase("ordinalInTurn")) mHeadings.put("ordinalInTurn", c); else if (sHeader.equalsIgnoreCase("wordAnnotationId")) mHeadings.put("wordAnnotationId", c); else if (sHeader.equalsIgnoreCase("ordinalInWord")) mHeadings.put("ordinalInWord", c); else if (sHeader.equalsIgnoreCase("segmentAnnotationId")) mHeadings.put("segmentAnnotationId", c); } // next header int highestHeaderIndex = 0; for (Integer i : mHeadings.values()) highestHeaderIndex = Math.max(highestHeaderIndex, i); mHeadings.put("comment", highestHeaderIndex + 1); for (int i = 2; i < lines.size(); i++) { CSVRecord line = lines.elementAt(i); Annotation annotation = new Annotation(line.get(mHeadings.get("id")), line.get(mHeadings.get("label")), layer.getId(), line.get(mHeadings.get("startAnchor.id")), line.get(mHeadings.get("endAnchor.id"))); annotation.setConfidence(new Integer(line.get(mHeadings.get("labelStatus")))); if (mHeadings.get("comment") < line.size()) { String comment = line.get(mHeadings.get("comment")); if (comment.length() > 0) { annotation.put("comment", comment); } } // parent if (layer.getParentId().equals("graph")) { annotation.setParentId(graph.getId()); } else if (layer.getParentId().equals(graph.getSchema().getTurnLayerId())) { if (layer.getId().equals(graph.getSchema().getUtteranceLayerId())) { // make sure turn exists Annotation turn = graph.getAnnotation(line.get(mHeadings.get("turnAnnotationId"))); if (turn == null) { // make sure participant exists Annotation participant = graph.getAnnotation(annotation.getLabel()); if (participant == null) { participant = new Annotation(annotation.getLabel(), annotation.getLabel(), graph.getSchema().getParticipantLayerId()); graph.addAnnotation(participant); } turn = new Annotation(line.get(mHeadings.get("turnAnnotationId")), annotation.getLabel(), graph.getSchema().getTurnLayerId(), // start/end IDs are set, but the anchor's themselves aren't added line.get(mHeadings.get("turnAnnotationId")) + " start", line.get(mHeadings.get("turnAnnotationId")) + " end", participant.getId()); graph.addAnnotation(turn); } // turn isn't there } // utterance layer annotation.setParentId(line.get(mHeadings.get("turnAnnotationId"))); } else if (layer.getParentId().equals(graph.getSchema().getWordLayerId())) { annotation.setParentId(line.get(mHeadings.get("wordAnnotationId"))); } else if (layer.getParentId().equals("segments")) { annotation.setParentId(line.get(mHeadings.get("segmentAnnotationId"))); } else if (layer.getId().equals(graph.getSchema().getTurnLayerId())) { // turn layer // make sure participant exists Annotation participant = graph.getAnnotation(annotation.getLabel()); if (participant == null) { participant = new Annotation(annotation.getLabel(), annotation.getLabel(), graph.getSchema().getParticipantLayerId()); graph.addAnnotation(participant); } annotation.setParentId(participant.getId()); } // ordinal if (layer.getId().equals(graph.getSchema().getWordLayerId())) { annotation.setOrdinal(Integer.parseInt(line.get(mHeadings.get("ordinalInTurn")))); } else if (layer.getId().equals("segments")) { annotation.setOrdinal(Integer.parseInt(line.get(mHeadings.get("ordinalInWord")))); } graph.addAnnotation(annotation); } }
From source file:nzilbb.csv.CsvDeserializer.java
/** * Deserializes the serialized data, generating one or more {@link Graph}s. * <p>Many data formats will only yield one graph (e.g. Transcriber * transcript or Praat textgrid), however there are formats that * are capable of storing multiple transcripts in the same file * (e.g. AGTK, Transana XML export), which is why this method * returns a list./*from w ww . j a va2 s.co m*/ * <p>This deserializer generates one graph per data row in the CSV file. * @return A list of valid (if incomplete) {@link Graph}s. * @throws SerializerNotConfiguredException if the object has not been configured. * @throws SerializationParametersMissingException if the parameters for this particular graph have not been set. * @throws SerializationException if errors occur during deserialization. */ public Graph[] deserialize() throws SerializerNotConfiguredException, SerializationParametersMissingException, SerializationException { if (participantLayer == null) throw new SerializerNotConfiguredException("Participant layer not set"); if (turnLayer == null) throw new SerializerNotConfiguredException("Turn layer not set"); if (utteranceLayer == null) throw new SerializerNotConfiguredException("Utterance layer not set"); if (wordLayer == null) throw new SerializerNotConfiguredException("Word layer not set"); if (schema == null) throw new SerializerNotConfiguredException("Layer schema not set"); validate(); String participantColumn = (String) parameters.get("who").getValue(); String textColumn = (String) parameters.get("text").getValue(); // if there are errors, accumlate as many as we can before throwing SerializationException SerializationException errors = null; Vector<Graph> graphs = new Vector<Graph>(); Iterator<CSVRecord> records = getParser().iterator(); while (records.hasNext()) { CSVRecord record = records.next(); Graph graph = new Graph(); if (parameters == null || parameters.get("id") == null || parameters.get("id").getValue() == null) { graph.setId(getName() + "-" + record.getRecordNumber()); } else { graph.setId(record.get((String) parameters.get("id").getValue())); } graph.setOffsetUnits(Constants.UNIT_CHARACTERS); // creat the 0 anchor to prevent graph tagging from creating one with no confidence Anchor firstAnchor = graph.getOrCreateAnchorAt(0.0, Constants.CONFIDENCE_MANUAL); Anchor lastAnchor = firstAnchor; // add layers to the graph // we don't just copy the whole schema, because that would imply that all the extra layers // contained no annotations, which is not necessarily true graph.addLayer((Layer) participantLayer.clone()); graph.getSchema().setParticipantLayerId(participantLayer.getId()); graph.addLayer((Layer) turnLayer.clone()); graph.getSchema().setTurnLayerId(turnLayer.getId()); graph.addLayer((Layer) utteranceLayer.clone()); graph.getSchema().setUtteranceLayerId(utteranceLayer.getId()); graph.addLayer((Layer) wordLayer.clone()); graph.getSchema().setWordLayerId(wordLayer.getId()); if (parameters != null) { for (Parameter p : parameters.values()) { if (p.getValue() instanceof Layer) { Layer layer = (Layer) p.getValue(); if (layer != null && graph.getLayer(layer.getId()) == null) { // haven't added this layer yet graph.addLayer((Layer) layer.clone()); } } } } // participant/author Annotation participant = graph.createTag(graph, schema.getParticipantLayerId(), record.get(participantColumn)); // meta-data for (String header : getHeaderMap().keySet()) { if (header.trim().length() == 0) continue; Parameter p = parameters.get("header_" + getHeaderMap().get(header)); if (p != null && p.getValue() != null) { Layer layer = (Layer) p.getValue(); String value = record.get(header); if (layer.getParentId().equals(schema.getRoot().getId())) // graph tag { graph.createTag(graph, layer.getId(), value); } else // participant tag { graph.createTag(participant, layer.getId(), value); } } // parameter set } // next header // text Annotation turn = new Annotation(null, participant.getLabel(), getTurnLayer().getId()); graph.addAnnotation(turn); turn.setParent(participant); turn.setStart(graph.getOrCreateAnchorAt(0.0, Constants.CONFIDENCE_MANUAL)); Annotation line = new Annotation(null, turn.getLabel(), getUtteranceLayer().getId()); line.setParentId(turn.getId()); line.setStart(turn.getStart()); int iLastPosition = 0; String sLine = record.get(textColumn).trim(); int iNumChars = sLine.length(); line = new Annotation(null, sLine, getUtteranceLayer().getId()); line.setParentId(turn.getId()); line.setStart(turn.getStart()); Anchor end = graph.getOrCreateAnchorAt(((double) iNumChars + 1), Constants.CONFIDENCE_MANUAL); line.setEnd(end); graph.addAnnotation(line); // ensure we have an utterance tokenizer if (getTokenizer() == null) { setTokenizer(new SimpleTokenizer(getUtteranceLayer().getId(), getWordLayer().getId())); } try { tokenizer.transform(graph); } catch (TransformationException exception) { if (errors == null) errors = new SerializationException(); if (errors.getCause() == null) errors.initCause(exception); errors.addError(SerializationException.ErrorType.Tokenization, exception.getMessage()); } graph.commit(); OrthographyClumper clumper = new OrthographyClumper(wordLayer.getId(), utteranceLayer.getId()); try { // clump non-orthographic 'words' with real words clumper.transform(graph); graph.commit(); } catch (TransformationException exception) { if (errors == null) errors = new SerializationException(); if (errors.getCause() == null) errors.initCause(exception); errors.addError(SerializationException.ErrorType.Tokenization, exception.getMessage()); } if (errors != null) throw errors; // set end anchors of graph tags for (Annotation a : graph.list(getParticipantLayer().getId())) { a.setStartId(firstAnchor.getId()); a.setEndId(lastAnchor.getId()); } graph.commit(); graphs.add(graph); } // next record return graphs.toArray(new Graph[0]); }
From source file:onlinenewspopularity.DataFormatter.java
/** * Reads the file and randomly populates the data * @return matrix list//www. j av a 2 s . com * The list has the following elements: * 1. List of features (mx1 ArrayList) * 2. Target column name * 3. Data for training (n1xm matrix) * 4. Target values for training data (n1x1 matrix) * 5. Test data (nxm matrix) * 6. Target values for test data (n2x2 matrix) * NOTE: n1 is the length of training data set. * n2 is the length of test data set. * n2 = Constants.SIZE*Constants.TEST_SET_RATIO * n1 = Constants.SIZE-n2 * @throws Exception */ public List<Matrix> readData() throws Exception { try { try (Reader br = new FileReader(new File(fileName))) { Iterable<CSVRecord> records = CSVFormat.DEFAULT.parse(br); List features = new ArrayList<>(); String predictColName; Iterator<CSVRecord> itr = records.iterator(); CSVRecord header = itr.next(); features.add(Constants.FEATURE_COL1_NAME); for (int i = Constants.INITIAL_FEATURE_INDEX; i < header.size() - 1; i++) { features.add(header.get(i).trim()); } predictColName = header.get((header.size() - 1)).trim(); trainStat = new double[2][features.size()]; double[][] data = new double[Constants.SIZE][features.size()]; double[][] res = new double[Constants.SIZE][1]; boolean[] validFeature = new boolean[features.size()]; int featureCount = 1; for (int i = 0; i < validFeature.length; i++) { validFeature[i] = Boolean.FALSE; //Not a valid feature by default } List indices = new ArrayList<>(); int n = Constants.SIZE; for (int i = 0; i < n; i++) { indices.add(i); } Random randGen = new Random(); validFeature[0] = Boolean.TRUE; //theta_0 is a valid feature int i = 0; for (CSVRecord record : records) { if (i < Constants.SIZE && !indices.isEmpty()) { int index = (int) indices.get(randGen.nextInt(indices.size())); for (int j = 0; j <= features.size(); j++) { if (j == 0) { data[index][j] = 1.0; } else if (j == features.size()) { res[index][0] = Double.parseDouble(record.get(record.size() - 1)); } else { data[index][j] = Double .parseDouble(record.get(j + Constants.INITIAL_FEATURE_INDEX - 1)); if (data[index][j] != 0) { if (validFeature[j] == Boolean.FALSE) { featureCount++; validFeature[j] = Boolean.TRUE; } } } } indices.remove((Object) index); } else { break; } i++; } //Remove empty features if (featureCount < features.size()) { List featuresCopy = new ArrayList<>(); featuresCopy.addAll(features); double[][] newData = new double[Constants.SIZE][featureCount]; int k = 0; int var = 0; for (int j = 0; j < featuresCopy.size(); j++) { if (validFeature[j] == Boolean.TRUE) { for (i = 0; i < Constants.SIZE; i++) { newData[i][k] = data[i][j]; } k++; } else { LOGGER.log(Level.INFO, "Removing empty feature: {0}", features.get(j - var)); features.remove(j - var); var++; } } data = newData; } int testLen = (int) (Constants.TEST_SET_RATIO * Constants.SIZE); int trainLen = Constants.SIZE - testLen; Matrix tmpx = new Matrix(data); Matrix tmpy = new Matrix(res); List temp = new ArrayList<>(); temp.add(features); temp.add(predictColName); temp.add(tmpx.getMatrix(0, trainLen - 1, 0, tmpx.getColumnDimension() - 1)); temp.add(tmpy.getMatrix(0, trainLen - 1, 0, tmpy.getColumnDimension() - 1)); temp.add(tmpx.getMatrix(trainLen, tmpx.getRowDimension() - 1, 0, tmpx.getColumnDimension() - 1)); temp.add(tmpy.getMatrix(trainLen, tmpy.getRowDimension() - 1, 0, tmpy.getColumnDimension() - 1)); return temp; } } catch (Exception e) { LOGGER.log(Level.WARNING, "{0}: {1}", new Object[] { e.getClass().getName(), e.getMessage() }); throw e; } }