List of usage examples for the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable constructor
public SequenceFileDirIterable(Path path, PathType pathType, PathFilter filter, Configuration conf)
From source file: org.pigml.classify.naivebayes.NaiveBayesModel.java
License: Apache License
public static NaiveBayesModel materialize(Path modelDir, Configuration conf) throws IOException { OpenIntDoubleHashMap weightsPerLabel = new OpenIntDoubleHashMap(); OpenIntDoubleHashMap weightsPerFeature = new OpenIntDoubleHashMap(); SequenceFileDirIterable<IntWritable, DoubleWritable> kvs; kvs = new SequenceFileDirIterable<IntWritable, DoubleWritable>(new Path(modelDir, "label_weights"), PathType.LIST, PathFilters.logsCRCFilter(), conf); for (Pair<IntWritable, DoubleWritable> kv : kvs) { weightsPerLabel.put(kv.getFirst().get(), kv.getSecond().get()); }/* w ww.j a va 2 s . c o m*/ kvs = new SequenceFileDirIterable<IntWritable, DoubleWritable>(new Path(modelDir, "feature_weights"), PathType.LIST, PathFilters.logsCRCFilter(), conf); for (Pair<IntWritable, DoubleWritable> kv : kvs) { weightsPerFeature.put(kv.getFirst().get(), kv.getSecond().get()); } Matrix weightsPerLabelAndFeature = null; SequenceFileDirIterable<IntWritable, VectorWritable> labelVectors = new SequenceFileDirIterable<IntWritable, VectorWritable>( new Path(modelDir, "label_feature_weights"), PathType.LIST, PathFilters.logsCRCFilter(), conf); for (Pair<IntWritable, VectorWritable> labelVector : labelVectors) { int label = labelVector.getFirst().get(); Vector vector = labelVector.getSecond().get(); if (weightsPerLabelAndFeature == null) { weightsPerLabelAndFeature = new SparseRowMatrix(weightsPerLabel.size(), vector.size()); } weightsPerLabelAndFeature.assignRow(label, vector); } // TODO alphaI is hard-coded to 1.0 // TODO perLabelThetaNormalizer is not supported yet NaiveBayesModel model = new NaiveBayesModel(weightsPerLabelAndFeature, weightsPerFeature, weightsPerLabel, 1.0f); model.validate(); return model; }
From source file: tk.summerway.mahout9.tools.MyClusterDumper.java
License: Apache License
public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir, long maxPointsPerCluster, Configuration conf) { Map<Integer, List<WeightedPropertyVectorWritable>> result = Maps.newTreeMap(); for (Pair<IntWritable, WeightedPropertyVectorWritable> record : new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>( pointsPathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { // value is the cluster id as an int, key is the name/id of the // vector, but that doesn't matter because we only care about // printing it // String clusterId = value.toString(); int keyValue = record.getFirst().get(); List<WeightedPropertyVectorWritable> pointList = result.get(keyValue); if (pointList == null) { pointList = Lists.newArrayList(); result.put(keyValue, pointList); }/*from ww w . j a v a2s. co m*/ if (pointList.size() < maxPointsPerCluster) { pointList.add(record.getSecond()); } } return result; }