hk.newsRecommender.TFIDF.java Source code

Introduction

Here is the source code for hk.newsRecommender.TFIDF.java, a driver that chains three Hadoop MapReduce jobs to compute TF-IDF weights for news articles: the first job de-duplicates (newsID, content) records read from HDFS, the second tokenizes each article with IKAnalyzer and computes per-article term frequencies, and the third counts how many articles contain each term, derives its inverse document frequency, and writes the final TF-IDF scores under /data/recommend/tfidf2.
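
The first job expects each line of /data/recommend/data2.txt to be tab-separated with at least five fields; only the second field (the news ID) and the fifth field (the article text) are used, and the remaining fields are ignored. A hypothetical input line therefore looks like this (the values are made up for illustration):

    0<TAB>1001<TAB>...<TAB>...<TAB>The article body that will be tokenized by IKAnalyzer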

Source

/**
 * $RCSfile: TFIDF.java
 * $Revision: 1.0
 * $Date: 2015-6-9
 *
 * Copyright (C) 2015 EastHope, Inc. All rights reserved.
 *
 * Use is subject to license terms.
 */
package hk.newsRecommender;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TFIDF {
    static long totalArticle = 100; // total number of articles in the corpus (hard-coded); numerator when computing IDF
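
    // Pipeline overview:
    //   job0 ("My_tdif_part0") - reads /data/recommend/data2.txt, keeps the newsID and
    //                            content fields and de-duplicates the records;
    //   job1 ("My_tdif_part1") - tokenizes each article with IKAnalyzer and emits
    //                            "word newsID" -> term frequency within that article;
    //   job2 ("My_tdif_part2") - counts how many articles contain each word, computes
    //                            IDF = log10(totalArticle / documentFrequency) and writes
    //                            "newsID <TAB> word TF TF-IDF" to /data/recommend/tfidf2.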

    public static void main(String[] args) throws Exception {
        // part0----------------------------------------------------
        Configuration conf0 = new Configuration();
        // DF()
        // FileSystem hdfs = FileSystem.get(conf1);
        // FileStatus p[] = hdfs.listStatus(new Path(args[0]));
        String hdfsUrl0 = conf0.get("fs.defaultFS");

        // the number of reduce tasks could be set here (see the commented-out setNumReduceTasks below)
        Job job0 = Job.getInstance(conf0, "My_tdif_part0");
        job0.setJarByClass(TFIDF.class);
        job0.setMapperClass(Mapper_Part0.class);
        // job1.setCombinerClass(Combiner_Part1.class); // optional combiner
        job0.setReducerClass(Reduce_Part0.class);
        job0.setMapOutputKeyClass(Text.class);
        job0.setMapOutputValueClass(Text.class);
        job0.setOutputKeyClass(Text.class);
        job0.setOutputValueClass(Text.class);
        // job1.setNumReduceTasks(p.length);

        FileInputFormat.addInputPath(job0, new Path(hdfsUrl0 + "/data/recommend/data2.txt"));
        FileOutputFormat.setOutputPath(job0, new Path(hdfsUrl0 + "/data/recommend/tfidf0"));

        job0.waitForCompletion(true);

        // part1----------------------------------------------------
        Configuration conf1 = new Configuration();
        // DF()
        // FileSystem hdfs = FileSystem.get(conf1);
        // FileStatus p[] = hdfs.listStatus(new Path(args[0]));
        String hdfsUrl = conf1.get("fs.defaultFS");

        // the number of reduce tasks could be set here (see the commented-out setNumReduceTasks below)
        Job job1 = Job.getInstance(conf1, "My_tdif_part1");
        job1.setJarByClass(TFIDF.class);
        job1.setMapperClass(Mapper_Part1.class);
        // job1.setCombinerClass(Combiner_Part1.class); // optional combiner
        job1.setReducerClass(Reduce_Part1.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(Text.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);
        // job1.setNumReduceTasks(p.length);
        job1.setPartitionerClass(MyPartitoner.class); // partition by the word part of the key

        FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf0"));
        FileOutputFormat.setOutputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf1"));

        job1.waitForCompletion(true);
        // part2----------------------------------------
        Configuration conf2 = new Configuration();

        Job job2 = Job.getInstance(conf2, "My_tdif_part2");
        job2.setJarByClass(TFIDF.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        job2.setMapperClass(Mapper_Part2.class);
        job2.setReducerClass(Reduce_Part2.class);
        // job2.setNumReduceTasks(p.length);

        FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/tfidf1"));
        FileOutputFormat.setOutputPath(job2, new Path(hdfsUrl + "/data/recommend/tfidf2"));

        job2.waitForCompletion(true);

        //      part3----------------------------------------
        //      Configuration conf3 = new Configuration();
        //      
        //      Job job3 = Job.getInstance(conf3, "My_tdif_part3");
        //      job3.setJarByClass(TFIDF.class);
        //      job3.setMapOutputKeyClass(Text.class);
        //      job3.setMapOutputValueClass(Text.class);
        //      job3.setOutputKeyClass(Text.class);
        //      job3.setOutputValueClass(Text.class);
        //      job3.setMapperClass(Mapper_Part3.class);
        //      job3.setReducerClass(Reduce_Part3.class);
        //      // job2.setNumReduceTasks(p.length);
        //      
        //      FileInputFormat.setInputPaths(job3, new Path(hdfsUrl + "/data/recommend/tfidf2"));
        //      FileOutputFormat.setOutputPath(job3, new Path(hdfsUrl + "/data/recommend/tfidf3"));
        //      
        //      job3.waitForCompletion(true);
        //      hdfs.delete(new Path(args[1]), true);
    }

    // part0: emit each (newsID, content) record once, de-duplicating the raw input ---------
    public static class Mapper_Part0 extends Mapper<LongWritable, Text, Text, Text> {
        //      long totalLine=0L;
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //         totalLine++;
            String[] lineSplits = value.toString().split("\t");
            String newsID = lineSplits[1];
            String content = lineSplits[4];
            context.write(new Text(newsID + " " + content), new Text(""));
        }
        //      public void cleanup(Context context) throws IOException, InterruptedException {
        //         // emit the total line count once the map task has finished
        //         String str = "";
        //         str += totalLine;
        //         context.write(new Text("!totalLine"), new Text(str));
        //         // the "!" prefix has a small ASCII code, so this record sorts before the word records
        //      }
    }

    public static class Reduce_Part0 extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    // part1: tokenize each article with IKAnalyzer and compute its term frequencies (TF) -----
    public static class Mapper_Part1 extends Mapper<LongWritable, Text, Text, Text> {
        String newsID = ""; // the news article ID
        String content = "";
        String word;

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            int all = 0; // total number of tokens produced for this article
            String[] lineSplits = value.toString().split(" ");
            newsID = lineSplits[0];
            content = lineSplits[1];

            Analyzer analyzer = new IKAnalyzer(false);
            TokenStream ts = analyzer.tokenStream("", new StringReader(content));
            ts.reset();
            CharTermAttribute cta = ts.getAttribute(CharTermAttribute.class);
            Map<String, Long> splitWordMap = new HashMap<String, Long>();
            while (ts.incrementToken()) {
                word = cta.toString();
                word += " ";
                word += newsID;
                all++;
                if (splitWordMap.containsKey(word))
                    splitWordMap.put(word, splitWordMap.get(word) + 1);
                else
                    splitWordMap.put(word, 1L);
            }
            Iterator<Map.Entry<String, Long>> iter = splitWordMap.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry<String, Long> entry = iter.next();
                String key1 = entry.getKey();
                Long val = entry.getValue();
                // emit "word newsID" -> TF (occurrences of the word / total tokens in the article)
                context.write(new Text(key1), new Text((val.floatValue() / all) + ""));
            }
        }

        //      public void cleanup(Context context) throws IOException, InterruptedException {
        //         // emit the total line count once the map task has finished
        //         String str = "";
        //         str += totalLine;
        //         context.write(new Text("!totalLine"), new Text(str));
        //         // the "!" prefix has a small ASCII code, so this record sorts before the word records
        //      }
    }
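
    // For illustration (hypothetical numbers): if article n1 contains 17 tokens and the
    // token "hello" occurs twice in it, Mapper_Part1 emits
    //   key = "hello n1", value = "0.11764706"   (2 / 17)
    // and MyPartitoner below sends every key starting with "hello" to the same reduce partition.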

    //   public static class Combiner_Part1 extends Reducer<Text, Text, Text, Text> {
    //      float all = 0;
    //      public void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
    //            InterruptedException {
    //         int index = key.toString().indexOf(" ");
    //         // records flagged with "!" carry the total count from the map side (see the commented-out cleanup above)
    //         if (key.toString().substring(index + 1, index + 2).equals("!")) {
    //            for (Text val : values) {
    //               all = Integer.parseInt(val.toString());
    //            }
    //            return; // do not emit this control record
    //         }
    //         float sum = 0; // number of occurrences of this word in the article
    //         for (Text val : values) {
    //            sum += Integer.parseInt(val.toString());
    //         }
    //         // term frequency: TF = sum / all
    //         float tmp = sum / all;
    //         String value = "";
    //         value += tmp;
    //
    //         // swap the two parts of the key, e.g. "test1 hello" -> "hello test1"
    //         String p[] = key.toString().split(" ");
    //         String key_to = "";
    //         key_to += p[1];
    //         key_to += " ";
    //         key_to += p[0];
    //         context.write(new Text(key_to), new Text(value));
    //      }
    //   }

    // part1 reducer: simply forwards each "word newsID" -> TF record
    public static class Reduce_Part1 extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text val : values) {
                context.write(key, val);
            }
        }
    }

    public static class MyPartitoner extends Partitioner<Text, Text> {
        // custom partitioner: partition on the word part of the "word newsID" key,
        // so every record for the same word goes to the same reduce partition
        public int getPartition(Text key, Text value, int numPartitions) {
            String ip1 = key.toString();
            ip1 = ip1.substring(0, ip1.indexOf(" ")); // keep only the word, e.g. "hello test1" -> "hello"
            Text p1 = new Text(ip1);
            return Math.abs((p1.hashCode() * 127) % numPartitions);
        }
    }

    // part2: compute each word's document frequency, its IDF, and the final TF-IDF -----------
    public static class Mapper_Part2 extends Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // TextOutputFormat separates key and value with a tab; normalize it to a space
            String val = value.toString().replaceAll("\t", " ");
            int index = val.indexOf(" ");
            String s1 = val.substring(0, index); // the word, e.g. "hello"
            String s2 = val.substring(index + 1); // "newsID TF", e.g. "test1 0.11764706"
            s2 += " ";
            s2 += "1"; // append a document-count marker, e.g. "test1 0.11764706 1"
            context.write(new Text(s1), new Text(s2));
        }
    }

    public static class Reduce_Part2 extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // all records for one word arrive in the same group
            float sum = 0;
            List<String> vals = new ArrayList<String>();
            for (Text str : values) {
                int index = str.toString().lastIndexOf(" ");
                sum += Integer.parseInt(str.toString().substring(index + 1)); // add the document-count marker
                vals.add(str.toString().substring(0, index)); // keep "newsID TF"
            }
            double tmp = Math.log10(totalArticle * 1.0 / (sum * 1.0)); // IDF = log10(totalArticle / documentFrequency)
            for (int j = 0; j < vals.size(); j++) {
                String val = vals.get(j);
                String newsID = val.substring(0, val.indexOf(" "));
                String end = val.substring(val.lastIndexOf(" ") + 1);
                float f_end = Float.parseFloat(end); // the TF computed in part1
                val += " ";
                val += f_end * tmp; // TF-IDF = TF * IDF
                //            context.write(key, new Text(val));
                context.write(new Text(newsID), new Text(key + " " + val.substring(val.indexOf(" ") + 1)));
            }
        }
    }
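
    // Worked example (illustrative figures, not taken from the original data): with
    // totalArticle = 100 and a word that occurs in 4 distinct articles,
    //   IDF    = log10(100 / 4) ≈ 1.39794
    //   TF-IDF = 0.11764706 * 1.39794 ≈ 0.16446
    // so Reduce_Part2 writes a line like "n1 <TAB> hello 0.11764706 0.16446..." for that article.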

    //   public static class Mapper_Part3 extends Mapper<LongWritable, Text, Text, Text> {
    //      public void map(LongWritable key, Text value, Context context) throws IOException,
    //            InterruptedException {
    //         // replace the tab between key and value with a space
    //         String val = value.toString().replaceAll("\t", " ");
    //         int index = val.indexOf(" ");
    //         String s1 = val.substring(0, index); // the part before the first space (the key)
    //         String s2 = val.substring(index + 1); // the rest of the record
    //         context.write(new Text(s1), new Text(s2));
    //      }
    //   }
    //   
    //   public static class Reduce_Part3 extends Reducer<Text, Text, Text, Text> {
    //      public void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
    //            InterruptedException {
    //         int limit = 0;
    //         for (Text str : values) {
    //            if (++limit <= 5) { // keep at most the first five records per key
    //               context.write(key, str);
    //            }
    //         }
    //      }
    //   }

}