Usage examples for opennlp.tools.util.TrainingParameters.getSettings()
public Map<String, String> getSettings()
From source file: edu.usc.irds.agepredictor.spark.authorage.AgePredictSGDTrainer.java
public static AgePredictModel createModel(String languageCode, SparkSession spark, String eventDir, AgeClassifyContextGeneratorWrapper wrapper, TrainingParameters trainParams) throws IOException { Map<String, String> params = trainParams.getSettings(); int cutoff = getCutoff(params); int iterations = getIterations(params); JavaRDD<String> data = spark.sparkContext().textFile(eventDir, 24).toJavaRDD().cache(); JavaRDD<Row> samples = data.map(new Function<String, Row>() { public Row call(String s) { if (s == null) { return null; }/*from ww w .j a v a 2 s . c o m*/ String[] parts = s.split(","); if (parts.length != 3) { return null; } try { if (parts[0] != "-1") { Integer value = Integer.parseInt(parts[0]); String[] text = parts[2].split(" "); //add in the category as another feature List<String> tokens = new ArrayList<String>(Arrays.asList(text)); for (int i = 0; i < text.length / 18; i++) { tokens.add("cat=" + parts[1]); } //System.out.println("Event:" + value + "," + Arrays.toString(tokens.toArray())); return RowFactory.create(value, tokens.toArray()); } else { return null; } } catch (Exception e) { return null; } } }).cache(); JavaRDD<Row> validSamples = samples.filter(new Function<Row, Boolean>() { @Override public Boolean call(Row s) { return s != null; } }).cache(); samples.unpersist(); StructType schema = new StructType(new StructField[] { new StructField("value", DataTypes.IntegerType, false, Metadata.empty()), new StructField("context", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(validSamples, schema).cache(); CountVectorizerModel cvm = new CountVectorizer().setInputCol("context").setOutputCol("feature") .setMinDF(cutoff).fit(df); Normalizer normalizer = new Normalizer().setInputCol("feature").setOutputCol("normFeature").setP(1.0); Dataset<Row> eventDF = cvm.transform(df).select("value", "feature"); //System.out.println("Vocab: " + cvm.vocabulary().length + "," + 
Arrays.toString(cvm.vocabulary())); Dataset<Row> normDF = normalizer.transform(eventDF).select("value", "normFeature"); JavaRDD<Row> events = normDF.javaRDD().cache(); eventDF.unpersist(); normDF.unpersist(); JavaRDD<LabeledPoint> parsedData = events.map(new Function<Row, LabeledPoint>() { public LabeledPoint call(Row r) { Integer val = r.getInt(0); SparseVector vec = (SparseVector) r.get(1); Vector features = Vectors.sparse(vec.size(), vec.indices(), vec.values()); return new LabeledPoint(val, features); } }).cache(); double stepSize = getStepSize(params); double regParam = getReg(params); LassoWithSGD algorithm = (LassoWithSGD) new LassoWithSGD().setIntercept(true); algorithm.optimizer().setNumIterations(iterations).setStepSize(stepSize).setRegParam(regParam); final LassoModel model = algorithm.run(JavaRDD.toRDD(parsedData)); System.out.println("Coefficients: " + Arrays.toString(model.weights().toArray())); System.out.println("Intercept: " + model.intercept()); // Evaluate model on training examples and compute training error JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData .map(new Function<LabeledPoint, Tuple2<Double, Double>>() { public Tuple2<Double, Double> call(LabeledPoint point) { double prediction = model.predict(point.features()); System.out.println(prediction + "," + point.label()); return new Tuple2<>(prediction, point.label()); } }).cache(); double MAE = new JavaDoubleRDD(valuesAndPreds.map(new Function<Tuple2<Double, Double>, Object>() { public Object call(Tuple2<Double, Double> pair) { return Math.abs(pair._1() - pair._2()); } }).rdd()).mean(); JavaRDD<Vector> vectors = valuesAndPreds.map(new Function<Tuple2<Double, Double>, Vector>() { public Vector call(Tuple2<Double, Double> pair) { return Vectors.dense(pair._1(), pair._2()); } }); Matrix correlMatrix = Statistics.corr(vectors.rdd(), "pearson"); System.out.println("Training Mean Absolute Error: " + MAE); System.out.println("Correlation:\n" + correlMatrix.toString()); Map<String, 
String> manifestInfoEntries = new HashMap<String, String>(); return new AgePredictModel(languageCode, model, cvm.vocabulary(), wrapper); }
From source file: es.ehu.si.ixa.pipe.nerc.train.InputOutputUtils.java
private static TrainingParameters loadTrainingParameters(String paramFile, boolean supportSequenceTraining) { TrainingParameters params = null; if (paramFile != null) { checkInputFile("Training Parameter", new File(paramFile)); InputStream paramsIn = null; try {// w ww .jav a 2 s . c om paramsIn = new FileInputStream(new File(paramFile)); params = new opennlp.tools.util.TrainingParameters(paramsIn); } catch (IOException e) { throw new TerminateToolException(-1, "Error during parameters loading: " + e.getMessage(), e); } finally { try { if (paramsIn != null) paramsIn.close(); } catch (IOException e) { // sorry that this can fail } } if (!TrainUtil.isValid(params.getSettings())) { throw new TerminateToolException(1, "Training parameters file '" + paramFile + "' is invalid!"); } if (!supportSequenceTraining && TrainUtil.isSequenceTraining(params.getSettings())) { throw new TerminateToolException(1, "Sequence training is not supported!"); } } return params; }