Java tutorial: UserTag.java -- building per-user news category preference vectors with Hadoop MapReduce
/**
 * $RCSfile: UserTag.java
 * $Revision: 1.0
 * $Date: 2015-6-24
 *
 * Copyright (C) 2015 EastHope, Inc. All rights reserved.
 *
 * Use is subject to license terms.
 */
package hk.newsRecommender;

import java.io.IOException;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.mahout.common.HadoopUtil;

public class UserTag {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String hdfsUrl = conf.get("fs.defaultFS");
        FileSystem fs = FileSystem.get(conf);

        // Job 1: read the raw click log (data2.txt) and emit one
        // (userId, newsId|publishTime) record per click.
        Job job1 = Job.getInstance(conf, "generateUserNewsMapping");
        Path output1Path = new Path(hdfsUrl + "/data/recommend/user1");
        HadoopUtil.delete(conf, output1Path);
        job1.setJarByClass(UserTag.class);
        job1.setMapperClass(Mapper_Part1.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(Text.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/data2.txt"));
        FileOutputFormat.setOutputPath(job1, output1Path);
        job1.waitForCompletion(true);

        // Job 2: join job1's output with ClusterPointsInfo.txt so that every click
        // carries the cluster (category) its news item belongs to.
        Job job2 = Job.getInstance(conf, "generateUserNewsCatMapping");
        Path output2Path = new Path(hdfsUrl + "/data/recommend/user2");
        HadoopUtil.delete(conf, output2Path);
        job2.setJarByClass(UserTag.class);
        job2.setMapperClass(Mapper_Part2.class);
        job2.setReducerClass(Reduce_Part2.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job2, new Path(hdfsUrl + "/data/recommend/user1"));
        FileInputFormat.addInputPath(job2, new Path(hdfsUrl + "/data/recommend/ClusterPointsInfo.txt"));
        FileOutputFormat.setOutputPath(job2, output2Path);
        job2.waitForCompletion(true);

        // Job 3: count how many times each user clicked news of each category.
        Job job3 = Job.getInstance(conf, "countUserNewsCatMapping");
        Path output3Path = new Path(hdfsUrl + "/data/recommend/user3");
        HadoopUtil.delete(conf, output3Path);
        job3.setJarByClass(UserTag.class);
        job3.setMapperClass(Mapper_Part3.class);
        job3.setReducerClass(Reduce_Part3.class);
        job3.setMapOutputKeyClass(Text.class);
        job3.setMapOutputValueClass(Text.class);
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/user2"));
        FileOutputFormat.setOutputPath(job3, output3Path);
        job3.waitForCompletion(true);
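        // Jobs 4 and 5 below work together: job4 derives the distinct set of category
        // ids from ClusterPointsInfo.txt, the driver then loads that list from job4's
        // output and broadcasts it to job5 through conf.setStrings("category", ...),
        // and Reduce_Part5 reads it back with conf.getStrings("category") so that every
        // user's preference vector lists the categories in the same fixed order.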
        // Job 4: emit each category id found in ClusterPointsInfo.txt exactly once.
        Job job4 = Job.getInstance(conf, "generateClusterUniqueRecord");
        Path output4Path = new Path(hdfsUrl + "/data/recommend/user4");
        HadoopUtil.delete(conf, output4Path);
        job4.setJarByClass(UserTag.class);
        job4.setMapperClass(Mapper_Part4.class);
        job4.setReducerClass(Reduce_Part4.class);
        job4.setMapOutputKeyClass(Text.class);
        job4.setMapOutputValueClass(Text.class);
        job4.setOutputKeyClass(Text.class);
        job4.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job4, new Path(hdfsUrl + "/data/recommend/ClusterPointsInfo.txt"));
        FileOutputFormat.setOutputPath(job4, output4Path);
        job4.waitForCompletion(true);

        // Read the distinct category ids produced by job4 and pass them to job5
        // through the Configuration.
        FileSystem fsopen = FileSystem.get(conf);
        FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/user4/part-r-00000"));
        Scanner scan = new Scanner(in);
        List<String> keywordList = new ArrayList<String>();
        while (scan.hasNext()) {
            keywordList.add(scan.next());
        }
        scan.close();
        conf.setStrings("category", keywordList.toArray(new String[keywordList.size()]));

        // Job 5: turn the per-user category counts into a normalized preference vector.
        Path output5Path = new Path(hdfsUrl + "/data/recommend/user5");
        if (fs.exists(output5Path)) {
            fs.delete(output5Path, true);
            System.out.println("Output path already exists, deleting: " + output5Path);
        }
        Job job5 = Job.getInstance(conf, "generateUserPreferableMatrix");
        job5.setJarByClass(UserTag.class);
        job5.setMapperClass(Mapper_Part5.class);
        job5.setReducerClass(Reduce_Part5.class);
        job5.setMapOutputKeyClass(Text.class);
        job5.setMapOutputValueClass(Text.class);
        job5.setOutputKeyClass(Text.class);
        job5.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job5, new Path(hdfsUrl + "/data/recommend/user3"));
        FileOutputFormat.setOutputPath(job5, output5Path);
        job5.waitForCompletion(true);
    }

    // part0 -- job1: extract (userId, newsId|publishTime) pairs from the click log ------------------
    public static class Mapper_Part1 extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split("\t");
            String publishTime = lineSplits[5];
            Calendar cal1 = Calendar.getInstance();
            try {
                // Convert the publish time (format "yyyyMMddHH:mm") to epoch milliseconds;
                // fall back to "0" if the field cannot be parsed.
                cal1.setTime(new SimpleDateFormat("yyyyMMddHH:mm").parse(publishTime));
                publishTime = Long.toString(cal1.getTimeInMillis());
            } catch (Exception e) {
                publishTime = "0";
            }
            context.write(new Text(lineSplits[0]), new Text(lineSplits[1] + "|" + publishTime));
        }
    }

    // part1 -- job2: reduce-side join of the click records with the cluster info --------------------
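    // Mapper_Part2 and Reduce_Part2 implement a classic reduce-side join: records coming
    // from ClusterPointsInfo.txt are tagged with an "A" prefix (the category side) and
    // records coming from job1's output are tagged with a "B" prefix (the user side), so
    // the reducer can tell the two sources apart after they are grouped by news key.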
    public static class Mapper_Part2 extends Mapper<LongWritable, Text, Text, Text> {

        private String flag;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            flag = split.getPath().getName(); // name of the file this split comes from
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split("\t");
            if (flag.equals("ClusterPointsInfo.txt")) {
                // category side: (newsKey, "A" + clusterId)
                context.write(new Text(lineSplits[0]), new Text("A" + lineSplits[1]));
            } else {
                // user side: (newsKey, "B" + userId)
                context.write(new Text(lineSplits[1]), new Text("B" + lineSplits[0]));
            }
        }
    }

    public static class Reduce_Part2 extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> userIdList = new ArrayList<String>();
            String category = "";
            for (Text text : values) {
                if (text.toString().startsWith("B"))
                    userIdList.add(text.toString().substring(1));
                else
                    category = text.toString().substring(1);
            }
            // Emit one (userId, newsKey category) record per user that clicked this news item.
            for (String userId : userIdList) {
                context.write(new Text(userId), new Text(key + " " + category));
            }
        }
    }

    // part2 -- job3: count clicks per (userId, category) pair ---------------------------------------
    public static class Mapper_Part3 extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split("\t");
            String[] lineVal = lineSplits[1].split(" ");
            context.write(new Text(lineSplits[0] + " " + lineVal[1]), new Text(""));
        }
    }

    public static class Reduce_Part3 extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int counter = 0;
            for (Text text : values) {
                counter++;
            }
            context.write(key, new Text(counter + ""));
        }
    }

    // part3 -- job4: collect the distinct category ids ----------------------------------------------
    public static class Mapper_Part4 extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split("\t");
            context.write(new Text(lineSplits[1]), new Text(""));
        }
    }

    public static class Reduce_Part4 extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Keys arrive grouped, so writing each key once de-duplicates the category ids.
            context.write(key, new Text(""));
        }
    }

    // part4 -- job5: build the normalized user preference vector ------------------------------------
    public static class Mapper_Part5 extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split("\t");
            String[] keyStr = lineSplits[0].split(" ");
            // Re-key by userId so the reducer sees all (category, count) pairs of one user.
            context.write(new Text(keyStr[0]), new Text(keyStr[1] + " " + lineSplits[1]));
        }
    }
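    // Reduce_Part5 divides each category count by the user's total click count and formats
    // the ratio with two decimal places, so every user ends up with a preference value for
    // every category in the broadcast list (0.00 for categories the user never clicked).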
    public static class Reduce_Part5 extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            Map<String, String> userCatMap = new HashMap<String, String>();
            StringBuilder sb = new StringBuilder();
            Configuration conf = context.getConfiguration();
            String[] catArray = conf.getStrings("category");
            float scoreSum = 0.0f;
            NumberFormat nf = NumberFormat.getNumberInstance();
            nf.setMinimumFractionDigits(2);
            for (Text text : values) {
                String[] valueSplit = text.toString().split(" ");
                userCatMap.put(valueSplit[0], valueSplit[1]);
                scoreSum += Float.parseFloat(valueSplit[1]);
            }
            for (String cat : catArray) {
                if (userCatMap.containsKey(cat)) {
                    sb.append(nf.format(Integer.parseInt(userCatMap.get(cat)) / scoreSum)).append(" ");
                } else {
                    sb.append(nf.format(0)).append(" ");
                }
            }
            // Trim the trailing space before writing the user's preference vector.
            context.write(key, new Text(sb.deleteCharAt(sb.length() - 1).toString()));
        }
    }
}
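The one non-obvious mechanism in this pipeline is how the driver hands the category list to Reduce_Part5: it is stored in the job Configuration as a single multi-valued property. The minimal sketch below shows that setStrings/getStrings round trip in isolation; the class name CategoryBroadcastDemo and the sample ids "12", "7", "3" are made up for illustration, and the only dependency is hadoop-common on the classpath.

    import org.apache.hadoop.conf.Configuration;

    public class CategoryBroadcastDemo {
        public static void main(String[] args) {
            Configuration conf = new Configuration();

            // Driver side: store the category list under one key, as UserTag does
            // before submitting job5.
            conf.setStrings("category", "12", "7", "3");

            // Task side: Reduce_Part5 reads the list back from its Configuration.
            String[] catArray = conf.getStrings("category");
            for (String cat : catArray) {
                System.out.println(cat);
            }
        }
    }

Under the hood the values are serialized into the configuration as a comma-separated string, so this trick only works for values that contain no commas, which is fine for numeric cluster ids. To run the full pipeline, package the class (and its Mahout dependency) into a jar produced by your build and submit it with hadoop jar, giving hk.newsRecommender.UserTag as the main class; the jar name depends on your build setup.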