List of usage examples for org.apache.hadoop.fs.FileSystem.createNewFile
public boolean createNewFile(Path f) throws IOException
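For orientation, here is a minimal sketch of the call itself before the longer examples below. It is illustrative only (the helper method name and the way the Configuration is obtained are not taken from any example on this page); createNewFile returns false if the file already exists.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal usage sketch (illustrative, not from the examples below):
// create an empty marker file and report whether it was newly created.
public static boolean createMarker(Configuration conf, String location) throws IOException {
  Path marker = new Path(location);
  FileSystem fs = marker.getFileSystem(conf); // resolve the FileSystem for this path's scheme
  return fs.createNewFile(marker);            // false if the file already exists
}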
From source file:org.apache.gobblin.data.management.trash.Trash.java
License:Apache License
protected void ensureTrashLocationExists(FileSystem fs, Path trashLocation) throws IOException {
  if (fs.exists(trashLocation)) {
    if (!fs.isDirectory(trashLocation)) {
      throw new IOException(String.format("Trash location %s is not a directory.", trashLocation));
    }
    if (!fs.exists(new Path(trashLocation, TRASH_IDENTIFIER_FILE))) {
      // If trash identifier file is not present, directory might have been created by user.
      // Add trash identifier file only if directory is empty.
      if (fs.listStatus(trashLocation).length > 0) {
        throw new IOException(String.format(
            "Trash directory %s exists, but it does not look like a trash directory. "
                + "File: %s missing and directory is not empty.",
            trashLocation, TRASH_IDENTIFIER_FILE));
      } else if (!fs.createNewFile(new Path(trashLocation, TRASH_IDENTIFIER_FILE))) {
        throw new IOException(String.format("Failed to create file %s in existing trash directory %s.",
            TRASH_IDENTIFIER_FILE, trashLocation));
      }
    }
  } else if (!(safeFsMkdir(fs, trashLocation.getParent(), ALL_PERM) && safeFsMkdir(fs, trashLocation, PERM)
      && fs.createNewFile(new Path(trashLocation, TRASH_IDENTIFIER_FILE)))) {
    // Failed to create directory or create trash identifier file.
    throw new IOException("Failed to create trash directory at " + trashLocation.toString());
  }
}
From source file:org.apache.hama.ml.ann.TestSmallLayeredNeuralNetwork.java
License:Apache License
public void testLogisticRegressionDistributedVersion() {
  // write data into a sequence file
  String tmpStrDatasetPath = "/tmp/logistic_regression_data";
  Path tmpDatasetPath = new Path(tmpStrDatasetPath);
  String strDataPath = "src/test/resources/logistic_regression_data.txt";
  String modelPath = "/tmp/logistic-regression-distributed-model";
  Configuration conf = new Configuration();
  List<double[]> instanceList = new ArrayList<double[]>();
  List<double[]> trainingInstances = null;
  List<double[]> testInstances = null;
  try {
    FileSystem fs = FileSystem.get(new URI(tmpStrDatasetPath), conf);
    fs.delete(tmpDatasetPath, true);
    if (fs.exists(tmpDatasetPath)) {
      fs.createNewFile(tmpDatasetPath);
    }
    BufferedReader br = new BufferedReader(new FileReader(strDataPath));
    String line = null;
    int count = 0;
    while ((line = br.readLine()) != null) {
      String[] tokens = line.trim().split(",");
      double[] instance = new double[tokens.length];
      for (int i = 0; i < tokens.length; ++i) {
        instance[i] = Double.parseDouble(tokens[i]);
      }
      instanceList.add(instance);
    }
    br.close();
    zeroOneNormalization(instanceList, instanceList.get(0).length - 1);
    // write training data to temporal sequence file
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, tmpDatasetPath, LongWritable.class,
        VectorWritable.class);
    int testSize = 150;
    Collections.shuffle(instanceList);
    testInstances = new ArrayList<double[]>();
    testInstances.addAll(instanceList.subList(instanceList.size() - testSize, instanceList.size()));
    trainingInstances = instanceList.subList(0, instanceList.size() - testSize);
    for (double[] instance : trainingInstances) {
      DoubleVector vec = new DenseDoubleVector(instance);
      writer.append(new LongWritable(count++), new VectorWritable(vec));
    }
    writer.close();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } catch (URISyntaxException e) {
    e.printStackTrace();
  }
  // create model
  int dimension = 8;
  SmallLayeredNeuralNetwork ann = new SmallLayeredNeuralNetwork();
  ann.setLearningRate(0.7);
  ann.setMomemtumWeight(0.5);
  ann.setRegularizationWeight(0.1);
  ann.addLayer(dimension, false, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.addLayer(dimension, false, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.addLayer(dimension, false, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.addLayer(1, true, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.setCostFunction(FunctionFactory.createDoubleDoubleFunction("CrossEntropy"));
  ann.setModelPath(modelPath);
  long start = new Date().getTime();
  Map<String, String> trainingParameters = new HashMap<String, String>();
  trainingParameters.put("tasks", "5");
  trainingParameters.put("training.max.iterations", "2000");
  trainingParameters.put("training.batch.size", "300");
  trainingParameters.put("convergence.check.interval", "1000");
  ann.train(tmpDatasetPath, trainingParameters);
  long end = new Date().getTime();
  // validate results
  double errorRate = 0;
  // calculate the error on test instance
  for (double[] testInstance : testInstances) {
    DoubleVector instance = new DenseDoubleVector(testInstance);
    double expected = instance.get(instance.getDimension() - 1);
    instance = instance.slice(instance.getDimension() - 1);
    double actual = ann.getOutput(instance).get(0);
    if (actual < 0.5 && expected >= 0.5 || actual >= 0.5 && expected < 0.5) {
      ++errorRate;
    }
  }
  errorRate /= testInstances.size();
  Log.info(String.format("Training time: %fs\n", (double) (end - start) / 1000));
  Log.info(String.format("Relative error: %f%%\n", errorRate * 100));
}
From source file:org.apache.hama.ml.ann.TestSmallLayeredNeuralNetwork.java
License:Apache License
public void testLogisticRegressionDistributedVersionWithFeatureTransformer() {
  // write data into a sequence file
  String tmpStrDatasetPath = "/tmp/logistic_regression_data_feature_transformer";
  Path tmpDatasetPath = new Path(tmpStrDatasetPath);
  String strDataPath = "src/test/resources/logistic_regression_data.txt";
  String modelPath = "/tmp/logistic-regression-distributed-model-feature-transformer";
  Configuration conf = new Configuration();
  List<double[]> instanceList = new ArrayList<double[]>();
  List<double[]> trainingInstances = null;
  List<double[]> testInstances = null;
  try {
    FileSystem fs = FileSystem.get(new URI(tmpStrDatasetPath), conf);
    fs.delete(tmpDatasetPath, true);
    if (fs.exists(tmpDatasetPath)) {
      fs.createNewFile(tmpDatasetPath);
    }
    BufferedReader br = new BufferedReader(new FileReader(strDataPath));
    String line = null;
    int count = 0;
    while ((line = br.readLine()) != null) {
      String[] tokens = line.trim().split(",");
      double[] instance = new double[tokens.length];
      for (int i = 0; i < tokens.length; ++i) {
        instance[i] = Double.parseDouble(tokens[i]);
      }
      instanceList.add(instance);
    }
    br.close();
    zeroOneNormalization(instanceList, instanceList.get(0).length - 1);
    // write training data to temporal sequence file
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, tmpDatasetPath, LongWritable.class,
        VectorWritable.class);
    int testSize = 150;
    Collections.shuffle(instanceList);
    testInstances = new ArrayList<double[]>();
    testInstances.addAll(instanceList.subList(instanceList.size() - testSize, instanceList.size()));
    trainingInstances = instanceList.subList(0, instanceList.size() - testSize);
    for (double[] instance : trainingInstances) {
      DoubleVector vec = new DenseDoubleVector(instance);
      writer.append(new LongWritable(count++), new VectorWritable(vec));
    }
    writer.close();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } catch (URISyntaxException e) {
    e.printStackTrace();
  }
  // create model
  int dimension = 8;
  SmallLayeredNeuralNetwork ann = new SmallLayeredNeuralNetwork();
  ann.setLearningRate(0.7);
  ann.setMomemtumWeight(0.5);
  ann.setRegularizationWeight(0.1);
  ann.addLayer(dimension, false, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.addLayer(dimension, false, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.addLayer(dimension, false, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.addLayer(1, true, FunctionFactory.createDoubleFunction("Sigmoid"));
  ann.setCostFunction(FunctionFactory.createDoubleDoubleFunction("CrossEntropy"));
  ann.setModelPath(modelPath);
  FeatureTransformer featureTransformer = new DefaultFeatureTransformer();
  ann.setFeatureTransformer(featureTransformer);
  long start = new Date().getTime();
  Map<String, String> trainingParameters = new HashMap<String, String>();
  trainingParameters.put("tasks", "5");
  trainingParameters.put("training.max.iterations", "2000");
  trainingParameters.put("training.batch.size", "300");
  trainingParameters.put("convergence.check.interval", "1000");
  ann.train(tmpDatasetPath, trainingParameters);
  long end = new Date().getTime();
  // validate results
  double errorRate = 0;
  // calculate the error on test instance
  for (double[] testInstance : testInstances) {
    DoubleVector instance = new DenseDoubleVector(testInstance);
    double expected = instance.get(instance.getDimension() - 1);
    instance = instance.slice(instance.getDimension() - 1);
    instance = featureTransformer.transform(instance);
    double actual = ann.getOutput(instance).get(0);
    if (actual < 0.5 && expected >= 0.5 || actual >= 0.5 && expected < 0.5) {
      ++errorRate;
    }
  }
  errorRate /= testInstances.size();
  Log.info(String.format("Training time: %fs\n", (double) (end - start) / 1000));
  Log.info(String.format("Relative error: %f%%\n", errorRate * 100));
}
From source file:org.apache.hama.ml.perception.TestSmallMultiLayerPerceptron.java
License:Apache License
/** Test the XOR problem. */
public void testTrainingByXOR() {
  // write in some training instances
  Configuration conf = new Configuration();
  String strDataPath = "/tmp/xor-training-by-xor";
  Path dataPath = new Path(strDataPath);
  // generate training data
  DoubleVector[] trainingData = new DenseDoubleVector[] { new DenseDoubleVector(new double[] { 0, 0, 0 }),
      new DenseDoubleVector(new double[] { 0, 1, 1 }), new DenseDoubleVector(new double[] { 1, 0, 1 }),
      new DenseDoubleVector(new double[] { 1, 1, 0 }) };
  try {
    URI uri = new URI(strDataPath);
    FileSystem fs = FileSystem.get(uri, conf);
    fs.delete(dataPath, true);
    if (!fs.exists(dataPath)) {
      fs.createNewFile(dataPath);
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, dataPath, LongWritable.class,
          VectorWritable.class);
      for (int i = 0; i < 1000; ++i) {
        VectorWritable vecWritable = new VectorWritable(trainingData[i % 4]);
        writer.append(new LongWritable(i), vecWritable);
      }
      writer.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  // begin training
  String modelPath = "/tmp/xorModel-training-by-xor.data";
  double learningRate = 0.6;
  double regularization = 0.02; // no regularization
  double momentum = 0.3; // no momentum
  String squashingFunctionName = "Tanh";
  String costFunctionName = "SquaredError";
  int[] layerSizeArray = new int[] { 2, 5, 1 };
  SmallMultiLayerPerceptron mlp = new SmallMultiLayerPerceptron(learningRate, regularization, momentum,
      squashingFunctionName, costFunctionName, layerSizeArray);
  Map<String, String> trainingParams = new HashMap<String, String>();
  trainingParams.put("training.iteration", "2000");
  trainingParams.put("training.mode", "minibatch.gradient.descent");
  trainingParams.put("training.batch.size", "100");
  trainingParams.put("tasks", "3");
  trainingParams.put("modelPath", modelPath);
  try {
    mlp.train(dataPath, trainingParams);
  } catch (Exception e) {
    e.printStackTrace();
  }
  // test the model
  for (int i = 0; i < trainingData.length; ++i) {
    DenseDoubleVector testVec = (DenseDoubleVector) trainingData[i].slice(2);
    try {
      double expected = trainingData[i].toArray()[2];
      double actual = mlp.output(testVec).toArray()[0];
      if (expected < 0.5 && actual >= 0.5 || expected >= 0.5 && actual < 0.5) {
        Log.info("Neural network fails to learn the XOR.");
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
From source file:org.apache.hama.ml.perception.TestSmallMultiLayerPerceptron.java
License:Apache License
/** Use transformer to extract the first half features of the original features. */
public void testFeatureTransformer() {
  // write in some training instances
  Configuration conf = new Configuration();
  String strDataPath = "/tmp/xor-training-by-xor";
  Path dataPath = new Path(strDataPath);
  // generate training data
  DoubleVector[] trainingData = new DenseDoubleVector[] { new DenseDoubleVector(new double[] { 0, 0, 0 }),
      new DenseDoubleVector(new double[] { 0, 1, 1 }), new DenseDoubleVector(new double[] { 1, 0, 1 }),
      new DenseDoubleVector(new double[] { 1, 1, 0 }) };
  try {
    URI uri = new URI(strDataPath);
    FileSystem fs = FileSystem.get(uri, conf);
    fs.delete(dataPath, true);
    if (!fs.exists(dataPath)) {
      fs.createNewFile(dataPath);
      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, dataPath, LongWritable.class,
          VectorWritable.class);
      for (int i = 0; i < 1000; ++i) {
        VectorWritable vecWritable = new VectorWritable(trainingData[i % 4]);
        writer.append(new LongWritable(i), vecWritable);
      }
      writer.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  // begin training
  String modelPath = "/tmp/xorModel-training-by-xor.data";
  double learningRate = 0.6;
  double regularization = 0.02; // no regularization
  double momentum = 0.3; // no momentum
  String squashingFunctionName = "Tanh";
  String costFunctionName = "SquaredError";
  int[] layerSizeArray = new int[] { 1, 5, 1 };
  SmallMultiLayerPerceptron mlp = new SmallMultiLayerPerceptron(learningRate, regularization, momentum,
      squashingFunctionName, costFunctionName, layerSizeArray);
  mlp.setFeatureTransformer(new FeatureTransformer() {
    @Override
    public DoubleVector transform(DoubleVector originalFeatures) {
      return originalFeatures.sliceUnsafe(originalFeatures.getDimension() / 2);
    }
  });
  Map<String, String> trainingParams = new HashMap<String, String>();
  trainingParams.put("training.iteration", "2000");
  trainingParams.put("training.mode", "minibatch.gradient.descent");
  trainingParams.put("training.batch.size", "100");
  trainingParams.put("tasks", "3");
  trainingParams.put("modelPath", modelPath);
  try {
    mlp.train(dataPath, trainingParams);
  } catch (Exception e) {
    e.printStackTrace();
  }
}
From source file:org.apache.hama.ml.recommendation.cf.InputConverter.java
License:Apache License
/**
 * Converts the given inputs into compatible output and saves it.
 * @param outputPath - output path of converted input
 * @return true if success
 */
public boolean convert(String outputPath) {
  try {
    Configuration conf = new Configuration();
    //URI outputUri = new URI(outputPath);
    Path outputDataPath = new Path(outputPath);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(outputDataPath, true);
    fs.createNewFile(outputDataPath);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputDataPath, Text.class,
        VectorWritable.class);
    // inputPreferences
    writeToFile(inputPreferences, OnlineCF.Settings.DFLT_PREFERENCE_DELIM, preferencesParser, fs, writer);
    // user features
    writeToFile(inputUserFeatures, OnlineCF.Settings.DFLT_USER_DELIM, userFeatureParser, fs, writer);
    // item features
    writeToFile(inputItemFeatures, OnlineCF.Settings.DFLT_ITEM_DELIM, itemFeatureParser, fs, writer);
    writer.close();
    return true;
  } catch (IOException e) {
    e.printStackTrace();
  }
  return false;
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
protected static void WriteModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
    throws IOException {
  int numHidden = model.getNrOfHiddenStates();
  int numObserved = model.getNrOfOutputStates();
  Matrix emissionMatrix = model.getEmissionMatrix();
  Matrix transitionMatrix = model.getTransitionMatrix();
  Vector initialProbability = model.getInitialProbabilities();
  MapWritable initialDistributionMap = new MapWritable();
  MapWritable transitionDistributionMap = new MapWritable();
  MapWritable emissionDistributionMap = new MapWritable();
  // delete the output directory
  HadoopUtil.delete(conf, modelPath);
  // create new file to store HMM
  FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
  Path outFile = new Path(modelPath, "part-randomSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
    try {
      // construct one MapWritable<IntWritable, DoubleWritable> object
      // and two MapWritable<Text, MapWritable<IntWritable, DoubleWritable>> objects
      for (int i = 0; i < numHidden; i++) {
        IntWritable initialDistributionKey = new IntWritable(i);
        DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
        log.info("BuildRandomModel Initial Distribution Map: State {} = {})",
            initialDistributionKey.get(), initialDistributionValue.get());
        initialDistributionMap.put(initialDistributionKey, initialDistributionValue);
        Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
        MapWritable transitionDistributionValue = new MapWritable();
        for (int j = 0; j < numHidden; j++) {
          IntWritable transitionDistributionInnerKey = new IntWritable(j);
          DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
          log.info("BuildRandomModel Transition Distribution Map Inner: ({}, {}) = ({}, {})",
              new Object[] { i, j, transitionDistributionInnerKey.get(),
                  transitionDistributionInnerValue.get() });
          transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
        }
        transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);
        Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
        MapWritable emissionDistributionValue = new MapWritable();
        for (int j = 0; j < numObserved; j++) {
          IntWritable emissionDistributionInnerKey = new IntWritable(j);
          DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
          log.info("BuildRandomModel Emission Distribution Map Inner: ({}, {}) = ({}, {})",
              new Object[] { i, j, emissionDistributionInnerKey.get(),
                  emissionDistributionInnerValue.get() });
          emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
        }
        emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
      }
      writer.append(new Text("INITIAL"), initialDistributionMap);
      log.info("Wrote random Initial Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
        log.info("Writing Transition Distribution Map Key, Value = ({}, {})", transitionEntry.getKey(),
            transitionEntry.getValue());
        writer.append(transitionEntry.getKey(), transitionEntry.getValue());
      }
      log.info("Wrote random Transition Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
        log.info("Writing Emission Distribution Map Key, Value = ({}, {})", emissionEntry.getKey(),
            emissionEntry.getValue());
        writer.append(emissionEntry.getKey(), emissionEntry.getValue());
      }
      log.info("Wrote random Emission Distribution Map to {}", outFile);
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
/**
 * Encodes a particular HmmModel as a Sequence File and writes it to the specified location.
 *
 * @param model HmmModel to be encoded
 * @param modelPath Location to store the encoded model
 * @param conf Configuration object
 * @throws IOException
 */
protected static void writeModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
    throws IOException {
  int numHidden = model.getNrOfHiddenStates();
  int numObserved = model.getNrOfOutputStates();
  Matrix emissionMatrix = model.getEmissionMatrix();
  Matrix transitionMatrix = model.getTransitionMatrix();
  Vector initialProbability = model.getInitialProbabilities();
  MapWritable initialDistributionMap = new MapWritable();
  MapWritable transitionDistributionMap = new MapWritable();
  MapWritable emissionDistributionMap = new MapWritable();
  // delete the output directory
  HadoopUtil.delete(conf, modelPath);
  // create new file to store HMM
  FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
  Path outFile = new Path(modelPath, "part-randomSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
    try {
      for (int i = 0; i < numHidden; i++) {
        IntWritable initialDistributionKey = new IntWritable(i);
        DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
        initialDistributionMap.put(initialDistributionKey, initialDistributionValue);
        Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
        MapWritable transitionDistributionValue = new MapWritable();
        for (int j = 0; j < numHidden; j++) {
          IntWritable transitionDistributionInnerKey = new IntWritable(j);
          DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
          transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
        }
        transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);
        Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
        MapWritable emissionDistributionValue = new MapWritable();
        for (int j = 0; j < numObserved; j++) {
          IntWritable emissionDistributionInnerKey = new IntWritable(j);
          DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
          emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
        }
        emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
      }
      writer.append(new Text("INITIAL"), initialDistributionMap);
      log.info("Wrote random Initial Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
        writer.append(transitionEntry.getKey(), transitionEntry.getValue());
      }
      log.info("Wrote random Transition Distribution Map to {}", outFile);
      for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
        writer.append(emissionEntry.getKey(), emissionEntry.getValue());
      }
      log.info("Wrote random Emission Distribution Map to {}", outFile);
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
}
From source file:org.apache.mahout.clustering.kmeans.EigenSeedGenerator.java
License:Apache License
public static Path buildFromEigens(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
    throws IOException {
  // delete the output directory
  FileSystem fs = FileSystem.get(output.toUri(), conf);
  HadoopUtil.delete(conf, output);
  Path outFile = new Path(output, "part-eigenSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    Path inputPathPattern;
    if (fs.getFileStatus(input).isDir()) {
      inputPathPattern = new Path(input, "*");
    } else {
      inputPathPattern = input;
    }
    FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
        ClusterWritable.class);
    Map<Integer, Double> maxEigens = Maps.newHashMapWithExpectedSize(k); // store max value of each column
    Map<Integer, Text> chosenTexts = Maps.newHashMapWithExpectedSize(k);
    Map<Integer, ClusterWritable> chosenClusters = Maps.newHashMapWithExpectedSize(k);
    for (FileStatus fileStatus : inputFiles) {
      if (!fileStatus.isDir()) {
        for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
            fileStatus.getPath(), true, conf)) {
          Writable key = record.getFirst();
          VectorWritable value = record.getSecond();
          for (Vector.Element e : value.get().nonZeroes()) {
            int index = e.index();
            double v = Math.abs(e.get());
            if (!maxEigens.containsKey(index) || v > maxEigens.get(index)) {
              maxEigens.put(index, v);
              Text newText = new Text(key.toString());
              chosenTexts.put(index, newText);
              Kluster newCluster = new Kluster(value.get(), index, measure);
              newCluster.observe(value.get(), 1);
              ClusterWritable clusterWritable = new ClusterWritable();
              clusterWritable.setValue(newCluster);
              chosenClusters.put(index, clusterWritable);
            }
          }
        }
      }
    }
    try {
      for (Integer key : maxEigens.keySet()) {
        writer.append(chosenTexts.get(key), chosenClusters.get(key));
      }
      log.info("EigenSeedGenerator:: Wrote {} Klusters to {}", chosenTexts.size(), outFile);
    } finally {
      Closeables.close(writer, false);
    }
  }
  return outFile;
}
From source file:org.apache.mahout.clustering.kmeans.RandomSeedGenerator.java
License:Apache License
public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure,
    Long seed) throws IOException {
  Preconditions.checkArgument(k > 0, "Must be: k > 0, but k = " + k);
  // delete the output directory
  FileSystem fs = FileSystem.get(output.toUri(), conf);
  HadoopUtil.delete(conf, output);
  Path outFile = new Path(output, "part-randomSeed");
  boolean newFile = fs.createNewFile(outFile);
  if (newFile) {
    Path inputPathPattern;
    if (fs.getFileStatus(input).isDir()) {
      inputPathPattern = new Path(input, "*");
    } else {
      inputPathPattern = input;
    }
    FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class,
        ClusterWritable.class);
    Random random = (seed != null) ? RandomUtils.getRandom(seed) : RandomUtils.getRandom();
    List<Text> chosenTexts = Lists.newArrayListWithCapacity(k);
    List<ClusterWritable> chosenClusters = Lists.newArrayListWithCapacity(k);
    int nextClusterId = 0;
    int index = 0;
    for (FileStatus fileStatus : inputFiles) {
      if (fileStatus.isDir()) {
        continue;
      }
      for (Pair<Writable, VectorWritable> record : new SequenceFileIterable<Writable, VectorWritable>(
          fileStatus.getPath(), true, conf)) {
        Writable key = record.getFirst();
        VectorWritable value = record.getSecond();
        Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
        newCluster.observe(value.get(), 1);
        Text newText = new Text(key.toString());
        int currentSize = chosenTexts.size();
        if (currentSize < k) {
          chosenTexts.add(newText);
          ClusterWritable clusterWritable = new ClusterWritable();
          clusterWritable.setValue(newCluster);
          chosenClusters.add(clusterWritable);
        } else {
          int j = random.nextInt(index);
          if (j < k) {
            chosenTexts.set(j, newText);
            ClusterWritable clusterWritable = new ClusterWritable();
            clusterWritable.setValue(newCluster);
            chosenClusters.set(j, clusterWritable);
          }
        }
        index++;
      }
    }
    try {
      for (int i = 0; i < chosenTexts.size(); i++) {
        writer.append(chosenTexts.get(i), chosenClusters.get(i));
      }
      log.info("Wrote {} Klusters to {}", k, outFile);
    } finally {
      Closeables.close(writer, false);
    }
  }
  return outFile;
}