Java tutorial
/**
 * $RCSfile: Cluster.java
 * $Revision: 1.0
 * $Date: 2015-6-22
 *
 * Copyright (C) 2015 EastHope, Inc. All rights reserved.
 *
 * Use is subject to license terms.
 */
package hk.newsRecommender;

import hk.newsRecommender.TFIDF.CustomKey;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.conversion.InputDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.clustering.ClusterDumper;

public class MatrixAndCluster {

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String hdfsUrl = conf.get("fs.defaultFS");

        // part1 -- collect the distinct keywords (the matrix dimensions) -------
        // Job job0 = Job.getInstance(conf, "siftKeywordsDimension");
        // Path output1Path = new Path(hdfsUrl + "/data/recommend/matrix1");
        // HadoopUtil.delete(conf, output1Path);
        // job0.setJarByClass(TFIDF.class);
        // job0.setMapperClass(Mapper_Part1.class);
        // job0.setReducerClass(Reduce_Part1.class);
        // job0.setMapOutputKeyClass(Text.class);
        // job0.setMapOutputValueClass(Text.class);
        // job0.setOutputKeyClass(Text.class);
        // job0.setOutputValueClass(Text.class);
        // job0.setPartitionerClass(CustomPartitioner.class);
        // FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/tfidf3"));
        // FileOutputFormat.setOutputPath(job0, output1Path);
        // job0.waitForCompletion(true);

        // part2 -- build the document/keyword matrix ---------------------------
        // FileSystem fsopen = FileSystem.get(conf);
        // FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000"));
        // Scanner scan = new Scanner(in);
        // List<String> keywordList = new ArrayList<String>();
        // while (scan.hasNext()) {
        //     keywordList.add(scan.next());
        // }
        // //// must be set before the job is created
        // conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()]));
        // Job job1 = Job.getInstance(conf, "generateMatrix");
        // Path output2Path = new Path(hdfsUrl + "/data/recommend/matrix2");
        // HadoopUtil.delete(conf, output2Path);
        // job1.setJarByClass(TFIDF.class);
        // job1.setMapperClass(Mapper_Part2.class);
        // job1.setReducerClass(Reduce_Part2.class);
        // job1.setMapOutputKeyClass(Text.class);
        // job1.setMapOutputValueClass(Text.class);
        // job1.setOutputKeyClass(Text.class);
        // job1.setOutputValueClass(NullWritable.class);
        // //// job1.addCacheFile(new Path("/data/recommend/matrix1/part-r-00000").toUri());
        // FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf3"));
        // FileOutputFormat.setOutputPath(job1, output2Path);
        // job1.waitForCompletion(true);

        // part3 -- k-means clustering ------------------------------------------
        Path output3Path = new Path(hdfsUrl + "/data/recommend/cluster2");
        HadoopUtil.delete(conf, output3Path);
        EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
        Path clusterInput = new Path(hdfsUrl + "/data/recommend/matrix2");
        Path clusterSeqInput = new Path(hdfsUrl + "/data/recommend/cluster1");
        Path clusterOutput = new Path(hdfsUrl + "/data/recommend/cluster2");
        int k = 10;
        int maxIter = 3;

        // optionally convert the text matrix into vector SequenceFiles with Mahout's InputDriver
        // InputDriver.runJob(clusterInput, clusterSeqInput, "org.apache.mahout.math.RandomAccessSparseVector");

        // randomly pick k initial cluster centers
        Path clusters = RandomSeedGenerator.buildRandom(conf, clusterSeqInput,
                new Path(clusterOutput, "clusters-0"), k, measure);
        KMeansDriver.run(conf, clusterSeqInput, clusters, clusterOutput, 0.01, maxIter, true, 0.0, false);

        // dump the clusters with ClusterDumper.printClusters
        // (note: depending on the Mahout version, the final clusters may be under "clusters-<n>-final")
        ClusterDumper clusterDumper = new ClusterDumper(
                new Path(clusterOutput, "clusters-" + (maxIter - 1)),
                new Path(clusterOutput, "clusteredPoints"));
        clusterDumper.printClusters(null);

        clusterOutput(conf, new Path(hdfsUrl + "/data/recommend/cluster2/clusteredPoints/part-m-00000"));
        // clusterOutput2(conf0, new Path(hdfsUrl0 + "/data/recommend/cluster2/clusteredPoints/part-m-00000"));
        // matrix2Vector(conf0, new Path(hdfsUrl0 + "/data/recommend/cluster1/part-m-00000"));
    }

    // part1 -- keyword extraction ---------------------------------------------------------
    public static class Mapper_Part1 extends Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split("\t");
            String keyword = lineSplits[1].split(" ")[0];
            context.write(new Text(keyword), new Text(""));
        }
    }

    public static class Reduce_Part1 extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public static class CustomPartitioner<K1, V1> extends Partitioner<K1, V1> {
        @Override
        public int getPartition(K1 key, V1 value, int numPartitions) {
            CustomKey keyK = (CustomKey) key;
            Text tmpValue = new Text(keyK.getSymbol());
            return (tmpValue.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    // part2 -- matrix generation ---------------------------------------------------------
    public static class Mapper_Part2 extends Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split("\t");
            context.write(new Text(lineSplits[0]), new Text(lineSplits[1]));
        }
    }

    public static class Reduce_Part2 extends Reducer<Text, Text, Text, NullWritable> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            String[] keywords = conf.getStrings("keyword");
            List<String> keywordsList = Arrays.asList(keywords);
            String newsID = key.toString().split("\\|")[0];
            long newsIDKeywordsCount = Long.parseLong(key.toString().split("\\|")[1]);
            String publishTime = key.toString().split("\\|")[2];
            StringBuilder sb = new StringBuilder(newsID + "|" + publishTime);
            // StringBuilder sb = new StringBuilder();
            Map<String, String> keywordMap = new HashMap<String, String>();
            for (Text value : values) {
                keywordMap.put(value.toString().split(" ")[0], value.toString().split(" ")[1]);
            }
            for (String str : keywordsList) {
                if (!keywordMap.containsKey(str))
                    sb.append(" ").append(0);
                else {
                    double probability = Double.parseDouble(keywordMap.get(str));
                    sb.append(" ").append(Math.round(probability * newsIDKeywordsCount));
                }
            }
            // sb.append(" ").append(newsID);
            // context.write(new Text(sb.toString()), new Text(newsID));
            context.write(new Text(sb.toString()), NullWritable.get());
        }
    }
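    // Illustrative example (not from the original code): assuming the upstream TFIDF job emits
    // lines of the form "newsID|keywordCount|publishTime \t keyword weight", Reduce_Part2 turns
    // each news item into one matrix row. With a keyword dictionary [economy, sports, tech]:
    //
    //   input key   : "1001|5|20150622"
    //   input values: "economy 0.4", "tech 0.2"
    //   output row  : "1001|20150622 2 0 1"
    //
    // i.e. each cell is round(weight * keywordCount), or 0 when the keyword does not occur.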
    // ----------------helpers---------------------------------------------------------
    // vector dimensionality
    public static int Cardinality = 2029;

    public static void matrix2Vector(Configuration conf, Path path) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        // open the SequenceFile and read its key/value pairs
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable val = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        Writer writer = null;
        try {
            writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            final IntWritable key1 = new IntWritable();
            final VectorWritable value = new VectorWritable();
            int lineNum = 0;
            Vector vector = null;
            while (reader.next(key, val)) {
                int index = 0;
                StringTokenizer st = new StringTokenizer(val.toString());
                // wrap a SequentialAccessSparseVector in a NamedVector so each row keeps its name
                vector = new NamedVector(new SequentialAccessSparseVector(Cardinality), lineNum + "");
                while (st.hasMoreTokens()) {
                    if (Integer.parseInt(st.nextToken()) == 1) {
                        vector.set(index, 1);
                    }
                    index++;
                }
                key1.set(lineNum++);
                value.set(vector);
                writer.append(key1, value);
            }
        } finally {
            writer.close();
            reader.close();
        }
    }
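    // Note (not in the original source): the clusteredPoints directory written by KMeansDriver
    // stores one entry per input vector, keyed by the IntWritable id of the cluster it was
    // assigned to, with a WeightedPropertyVectorWritable value wrapping the original NamedVector.
    // The two helpers below rely on that layout.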
    public static void clusterOutput(Configuration conf, Path path) {
        try {
            BufferedWriter bw;
            FileSystem fs = FileSystem.get(conf);
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            // write each clustered point as: vectorName \t clusterId
            bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsInfo.txt")));
            HashMap<String, Integer> clusterIds = new HashMap<String, Integer>(120);
            IntWritable key = new IntWritable();
            WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
            // WeightedVectorWritable value = new WeightedVectorWritable();
            while (reader.next(key, value)) {
                NamedVector vector = (NamedVector) value.getVector();
                // the vector name carries the original row id
                String vectorName = vector.getName();
                System.out.println(vectorName + "\t" + key.toString());
                bw.write(vectorName + "\t" + key.toString() + "\n");
                // count the number of points in each cluster
                if (clusterIds.containsKey(key.toString())) {
                    clusterIds.put(key.toString(), clusterIds.get(key.toString()) + 1);
                } else
                    clusterIds.put(key.toString(), 1);
            }
            bw.flush();
            reader.close();
            // write each cluster id and its size
            bw = new BufferedWriter(new FileWriter(new File("C:\\Users\\Hk\\Desktop\\ClusterPointsSize.txt")));
            Set<String> keys = clusterIds.keySet();
            for (String k : keys) {
                System.out.println(k + " " + clusterIds.get(k));
                bw.write(k + " " + clusterIds.get(k) + "\n");
            }
            bw.flush();
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void clusterOutput2(Configuration conf, Path path) {
        try {
            FileSystem fs = FileSystem.get(conf);
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            IntWritable key = new IntWritable();
            WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
            while (reader.next(key, value)) {
                System.out.println(value.toString() + " belongs to cluster " + key.toString());
            }
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
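For reference, the clusteredPoints output can also be read with the non-deprecated SequenceFile.Reader API instead of the Reader(fs, path, conf) constructor used above. The sketch below is a minimal, self-contained example under that assumption; the class name ClusteredPointsReader and the default path argument are illustrative, not part of the original code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.math.NamedVector;

public class ClusteredPointsReader {

    // Prints "rowName -> clusterId" for every entry in a clusteredPoints part file.
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path points = new Path(args.length > 0 ? args[0]
                : "/data/recommend/cluster2/clusteredPoints/part-m-00000");

        // SequenceFile.Reader.file(...) replaces the deprecated Reader(fs, path, conf) constructor
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(points));
        try {
            IntWritable clusterId = new IntWritable();
            WeightedPropertyVectorWritable point = new WeightedPropertyVectorWritable();
            while (reader.next(clusterId, point)) {
                // the NamedVector name is the row id assigned when the matrix was vectorized
                String name = (point.getVector() instanceof NamedVector)
                        ? ((NamedVector) point.getVector()).getName()
                        : "(unnamed)";
                System.out.println(name + " -> " + clusterId.get());
            }
        } finally {
            reader.close();
        }
    }
}

Pass the part file path as the first argument when the default above does not match your HDFS layout.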