Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package com.imolinfo.offline; import com.imolinfo.model.Document; import com.imolinfo.plug.iface.DocumentProvider; import com.imolinfo.plug.iface.DocumentToLabeledPoint; import com.imolinfo.plug.impl.DocumentStandardCleaner; import com.imolinfo.plug.impl.DocumentToTFIDFLabeledPoint; import com.imolinfo.util.GlobalVariable; import com.imolinfo.plug.clm.SVMOneVsAll; import com.imolinfo.util.TestUtils; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectOutputStream; import java.util.Properties; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.classification.NaiveBayes; import org.apache.spark.mllib.classification.NaiveBayesModel; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.rdd.RDD; /** * * @author renzo */ public class InModelValidation { public static void main(String[] args) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException { SparkConf conf = new SparkConf().setAppName("Train"); final JavaSparkContext jsc = new JavaSparkContext(conf); Properties p = new Properties(); p.load(new FileInputStream("runtime.properties")); GlobalVariable.getInstance().setProperties(p); invokePipeline(jsc); } public static void invokePipeline(JavaSparkContext jsc) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException { /* Properties prop = GlobalVariable.getInstance().getProperties(); DocumentProvider tp = (DocumentProvider) Class.forName(prop.getProperty("sourceClass")).newInstance(); JavaRDD<Document> inputD = tp.getTextFromDs(jsc, prop.getProperty("trainingSet")); DocumentStandardCleaner tc = new DocumentStandardCleaner(); inputD = tc.cleanData(inputD); DocumentToLabeledPoint tl = new DocumentToTFIDFLabeledPoint(); inputD = tl.vectorize(inputD); JavaRDD<LabeledPoint> features = tl.convert(inputD); RDD<LabeledPoint> featureData = features.rdd(); FileUtils.deleteDirectory(new File(prop.getProperty("outputModelPath"))); File idfFile = new File(prop.getProperty("idfModelPath")); idfFile.mkdirs(); FileOutputStream fos = new FileOutputStream(prop.getProperty("idfModelFile")); ObjectOutputStream oos = new ObjectOutputStream(fos); oos.writeObject(tl.getIDFModel()); featureData.cache(); features.cache(); NaiveBayesModel nbModel = NaiveBayes.train(featureData, 1); //nbModel.save(jsc.sc(), prop.getProperty("nbPath")); SVMOneVsAll svmModel = SVMOneVsAll.train(jsc,features); //svmModel.save(jsc, prop.getProperty("svmOAPath")); TestUtils.analyze(features, nbModel); String nbResult=TestUtils.printStats("NAIVE BAYES"); TestUtils.analyze(features, svmModel); String svmResult=TestUtils.printStats("SVM"); System.out.println(nbResult); System.out.println(svmResult); */ } }