Java tutorial
/******************************************************************************* * Copyright 2013 Gebre * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package nl.cwi.kba.apps; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Hashtable; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import nl.cwi.kba.thrift.bin.StreamItemWritable; import nl.cwi.kba.thrift.bin.ThriftFileInputFormat; import nl.cwi.json.topics.Filter_topics; import nl.cwi.json2012.run.Filter_run; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.LineReader; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; /** * Toy KBA system similar to the Python version. It identifies entities in * documents based on mere lexical matching. * * @uthor Gebremeskel * */ public class EntitySurfaceForms extends Configured implements Tool { public static final String QUERYFILEPATH_HDFS = "kba.topicfilelocation"; public static final String LABELSFILEPATH_HDFS = "kba.labels"; public static final String PPR_HDFS = "ppr"; public static final String GCLD_HDFS = "gcld"; public static Map<String, HashSet<String>> DbPedia = new LinkedHashMap<String, HashSet<String>>(); public static Map<String, HashMap<String, Double>> Ppr = new LinkedHashMap<String, HashMap<String, Double>>(); public static Map<String, HashMap<String, Double>> Gcld = new LinkedHashMap<String, HashMap<String, Double>>(); // public static Map<String, String> Attr ; public static final String RUNTAG = "kba.runtag"; public static final String TEAMNAME = "kba.teamname"; protected enum Counter { documents }; private static final Logger LOG = Logger.getLogger(EntitySurfaceForms.class); public static class MyReducer extends Reducer<Text, Text, Text, Text> { private static final Text out = new Text(); private static final Text none = new Text(""); private String runtag; private String teamname; @Override public void reduce(Text date, Iterable<Text> values, Context context) throws IOException, InterruptedException { context.write(out, none); } } /** * Emits date, PairOfStringLong pairs, where the string contains the docno * and the topic and the long contains the score. * * @author emeij * */ public static class MyMapper extends Mapper<Text, StreamItemWritable, Text, Text> { public static final int scale = 1000; private static final Text stream_doc_id = new Text(); // private static final Text featurLine = new Text(); private Map<String, Pattern> topicregexes = new LinkedHashMap<String, Pattern>(); private Map<String, HashSet<String>> partialtopics = new LinkedHashMap<String, HashSet<String>>(); private Filter_topics ft = null; /** * Used to load the queries. */ @Override public void setup(Context context) throws IOException, InterruptedException { super.setup(context); DbPedia = loadLabels(LABELSFILEPATH_HDFS); Ppr = loadPPR(PPR_HDFS); Gcld = loadGCLD(GCLD_HDFS); loadTopics(QUERYFILEPATH_HDFS); } /** * Loads the queries from the JSON topic file. * * @param context */ /** * Not used */ @Override protected void cleanup(Context context) throws IOException, InterruptedException { super.cleanup(context); } @Override public void map(Text key, StreamItemWritable value, Context context) throws IOException, InterruptedException { context.getCounter(Counter.documents).increment(1); // partialtopics = DbPedia; String body = new String(value.getBody().getCleansed()); String title = new String(value.getTitle().getRaw().toString()); String anchor = new String(value.getAnchor().getRaw().toString()); // lowercase all body = body.toLowerCase(); title = title.toLowerCase(); anchor = anchor.toLowerCase(); String streamid = value.getStream_id(); for (String topic : topicregexes.keySet()) { String stream_id_doc_id = streamid + "\t" + topic; Integer canonical_value = 0; Integer surfaceForm_value = 0; Integer ppr_value = 0; Integer gcld_value = 0; Double temp = 0.0; for (String partial : partialtopics.get(topic)) { if (containsMention(title + body + anchor, new HashSet<String>(Arrays.asList(partial))) == 1) { temp = temp + (double) partial.length() / topic.length(); } } temp = temp * scale; canonical_value = temp.intValue(); temp = 0.0; for (String s_form : DbPedia.get(topic)) { if (containsMention(title + body + anchor, new HashSet<String>(Arrays.asList(s_form))) == 1) { temp = temp + (double) s_form.length() / topic.length(); } } temp = temp * scale; surfaceForm_value = temp.intValue(); temp = 0.0; for (String gcld_key : Gcld.get(topic).keySet()) { if (containsMention(title + body + anchor, new HashSet<String>(Arrays.asList(gcld_key))) == 1) { temp = temp + Gcld.get(topic).get(gcld_key); } } temp = temp * scale; gcld_value = temp.intValue(); temp = 0.0; System.out.println("It is dying at " + topic); for (String ppr_key : Ppr.get(topic).keySet()) { if (containsMention(title + body + anchor, new HashSet<String>(Arrays.asList(ppr_key))) == 1) { temp = temp + Ppr.get(topic).get(ppr_key); } } temp = temp * scale; ppr_value = temp.intValue(); if ((canonical_value + surfaceForm_value + gcld_value + ppr_value) > 0) context.write(new Text("CWI" + "\t" + "toy_j48" + "\t" + stream_id_doc_id + "\t"), new Text(canonical_value.toString() + "\t" + surfaceForm_value.toString() + "\t" + ppr_value.toString() + "\t" + gcld_value.toString())); } } public int containsMention(String str, HashSet<String> hs) { int i = 0; // System.out.println(); for (String entity : hs) { if (str.contains(entity)) { i = 1; } } return i; } private void loadTopics(String xcontext) { DataInputStream in = null; try { in = new DataInputStream(new FileInputStream(xcontext)); BufferedReader br = new BufferedReader(new InputStreamReader(in)); ft = new Filter_topics.Factory().loadTopics(br); LOG.info(ft.getTopic_set_id()); for (String t : ft.getTopic_names()) { Pattern p; // add the full name p = Pattern.compile(".*\\b+" + t.replaceAll("_", " ") + "\\b+.*", Pattern.CASE_INSENSITIVE); topicregexes.put(t, p); // add the individual terms HashSet<String> pset = new HashSet<String>(); pset.addAll(new HashSet<String>(Arrays.asList(t.split("_")))); pset.add(t.replaceAll("_", " ").toLowerCase()); partialtopics.put(t, pset); } } catch (IOException e) { e.printStackTrace(); LOG.error("read from distributed cache: read instances"); } catch (Exception e) { e.printStackTrace(); LOG.error("read from distributed cache: " + e); } finally { if (in != null) { try { in.close(); } catch (IOException e1) { e1.printStackTrace(); } } } } private Map loadLabels(String xcontext) { // Hashtable<String, Hashtable> prep = new Hashtable(); Map<String, HashSet<String>> prep = new LinkedHashMap<String, HashSet<String>>(); Hashtable<String, HashSet> tmp = new Hashtable(); DataInputStream in = null; // System.out.println("This is from LoadLabels"); System.out.println("\nwhat am I opening :" + xcontext); // System.out.println("\nDBpedia File :"+ LABELSFILEPATH_HDFS); // System.out.println("\nDBpedia File :"+LABELSFILEPATH_HDFS); try { in = new DataInputStream(new FileInputStream(xcontext)); BufferedReader br = new BufferedReader(new InputStreamReader(in)); String strLine; // Read File Line By Line while ((strLine = br.readLine()) != null) { // Print the content on the console String temp_key; String temp_value; String[] str = new String(strLine.getBytes("UTF-8"), "UTF-8").split("\t"); temp_key = str[0]; temp_value = str[1]; // System.out.println("length " + str.length); HashSet<String> keys = (HashSet<String>) tmp.get(temp_key); if (keys == null) { keys = new HashSet<String>(); tmp.put(temp_key, keys); } keys.add(temp_value.toLowerCase()); // keys.add(temp_value); } for (Object entity : tmp.keySet()) { prep.put((String) entity, tmp.get(entity)); } } catch (IOException e) { e.printStackTrace(); LOG.error("read from distributed cache: read instances"); } catch (Exception e) { e.printStackTrace(); LOG.error("read from distributed cache: " + e); } finally { if (in != null) { try { in.close(); } catch (IOException e1) { e1.printStackTrace(); } } } return prep; } private Map loadPPR(String xcontext) { // Hashtable<String, Hashtable> prep = new Hashtable(); Map<String, HashMap<String, Double>> prep = new LinkedHashMap<String, HashMap<String, Double>>(); Hashtable<String, HashSet> tmp = new Hashtable(); DataInputStream in = null; // String xcontext // ="/ufs/gebre/Downloads/trec-kba-ccr-topics-2012.ppr-top100.txt"; // System.out.println("This is from LoadLabels"); // System.out.println("\nwhat am I opening :" + xcontext); // System.out.println("\nDBpedia File :"+ LABELSFILEPATH_HDFS); // System.out.println("\nPPr from loadPPR :"+xcontext); try { in = new DataInputStream(new FileInputStream(xcontext)); BufferedReader br = new BufferedReader(new InputStreamReader(in)); String strLine; // Read File Line By Line String key_check = ""; HashMap<String, Double> intermediat = new HashMap<String, Double>(); while ((strLine = br.readLine()) != null) { // Print the content on the console // System.out.println("\nFrom PPR strLin :"+strLine); String temp_key; Double temp_value; String temp_str; String[] str = new String(strLine.getBytes("UTF-8"), "UTF-8").split("\t"); String[] entity = str[1].split(" "); int i = 0; temp_key = entity[i]; while (++i < entity.length) { temp_key = temp_key + "_" + entity[i]; } temp_str = str[3]; if (key_check == "") { key_check = temp_key; // System.out.println( " Executed ! :) "+ key_check ); } temp_value = Double.parseDouble(str[4]); if (!key_check.equalsIgnoreCase(temp_key)) { // System.out.println(" When: " + temp_key); if (key_check.equalsIgnoreCase("William_H._Gates,_Sr.")) { key_check = "William_H._Gates,_Sr"; } prep.put(key_check, intermediat); intermediat.clear(); key_check = temp_key; } intermediat.put(temp_str.toLowerCase(), temp_value); } if (key_check.equalsIgnoreCase("William_H._Gates,_Sr.")) { key_check = "William_H._Gates,_Sr"; } prep.put(key_check, intermediat); } catch (IOException e) { e.printStackTrace(); LOG.error("read from distributed cache: read instances"); } catch (Exception e) { e.printStackTrace(); LOG.error("read from distributed cache: " + e); } finally { if (in != null) { try { in.close(); } catch (IOException e1) { e1.printStackTrace(); } } } return prep; } private Map loadGCLD(String xcontext) { // Hashtable<String, Hashtable> prep = new Hashtable(); Map<String, HashMap<String, Double>> prep = new LinkedHashMap<String, HashMap<String, Double>>(); Hashtable<String, HashSet> tmp = new Hashtable(); DataInputStream in = null; // String xcontext // ="/ufs/gebre/Downloads/trec-kba-ccr-topics-2012.ppr-top100.txt"; // System.out.println("This is from LoadLabels"); // System.out.println("\nwhat am I opening :" + xcontext); // System.out.println("\nDBpedia File :"+ LABELSFILEPATH_HDFS); // System.out.println("\nDBpedia File :"+LABELSFILEPATH_HDFS); try { in = new DataInputStream(new FileInputStream(xcontext)); BufferedReader br = new BufferedReader(new InputStreamReader(in)); String strLine; // Read File Line By Line String key_check = ""; HashMap<String, Double> intermediat = new HashMap<String, Double>(); while ((strLine = br.readLine()) != null) { // Print the content on the console String temp_key; Double temp_value; String temp_str; String[] str = new String(strLine.getBytes("UTF-8"), "UTF-8").split("\t"); temp_str = str[0]; temp_key = str[2]; temp_value = Double.parseDouble(str[1]) * Double.parseDouble(str[4]); if (key_check == "") { key_check = temp_key; // System.out.println( " Executed ! :) "+ key_check ); } if (!key_check.equalsIgnoreCase(temp_key)) { // System.out.println(" When ! :) " + temp_key); prep.put(key_check, intermediat); intermediat.clear(); key_check = temp_key; } intermediat.put(temp_str.toLowerCase(), temp_value); } prep.put(key_check, intermediat); } catch (IOException e) { e.printStackTrace(); LOG.error("read from distributed cache: read instances"); } catch (Exception e) { e.printStackTrace(); LOG.error("read from distributed cache: " + e); } finally { if (in != null) { try { in.close(); } catch (IOException e1) { e1.printStackTrace(); } } } return prep; } } /** * Loads the JSON topic file. * * @param context */ private static void loadTopicData(String queryfile, Filter_run fr, FileSystem fs, HashMap<String, Object> run_info) { FSDataInputStream in = null; try { in = fs.open(new Path(queryfile)); BufferedReader br = new BufferedReader(new InputStreamReader(in)); Filter_topics ft = new Filter_topics.Factory().loadTopics(br); fr.setTopic_set_id(ft.getTopic_set_id()); run_info.put("num_entities", ft.getTopic_names().size()); } catch (IOException e1) { e1.printStackTrace(); } catch (Exception e1) { e1.printStackTrace(); } finally { if (in != null) { try { in.close(); } catch (IOException e1) { e1.printStackTrace(); } } } } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new EntitySurfaceForms(), args); System.exit(res); } static int printUsage() { System.out.println("Usage: " + EntitySurfaceForms.class.getName() + " -i input -o output -q query_file (hdfs) -l labels_file -p ppr_file -g -gcld_file [-c corpus_id -r runtag -t teamname -d description] \n\n" + "Example usage: hadoop jar trec-kba.jar " + EntitySurfaceForms.class.getName() + " " + "-i KBA/Data/kba-stream-corpus-2012-cleansed-only-testing-out/*/* " + "-o KBA/OutPut/Kba_2012/cwi-cano_sform_ppr_gcld" + " -q KBA/Data/trec-kba-ccr-2012.filter-topics.json" + "-l KBA/Data/entity-surface-forms_merged.txt" + "-p KBA/Data/trec-kba-ccr-topics-2012.ppr-top100.txt" + " -g KBA/Data/dictionary.txt -r entity-sforms -t CWI -d \"scores doc-entity pair based on surface forms \"" + ">/export/scratch2/TREC-kba/TREC-KBA-2013/data/multi-step/Results/graphs/String-matching/cwi-cano_sform_ppr_gcld"); ToolRunner.printGenericCommandUsage(System.out); return -1; } @Override public int run(String[] args) throws Exception { String in = null; String out = null; String queryfile = null; String systemdescription = null; String corpus_id = null; String runtag = null; String teamname = null; String gcldFile = null; String labelsFile = null; String pprFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-r".equals(args[i])) { runtag = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-t".equals(args[i])) { teamname = args[++i]; } else if ("-d".equals(args[i])) { systemdescription = args[++i]; } else if ("-p".equals(args[i])) { pprFile = args[++i]; } else if ("-g".equals(args[i])) { gcldFile = args[++i]; } else if ("-c".equals(args[i])) { corpus_id = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); if (runtag == null) runtag = "toy_1"; if (teamname == null) teamname = "CompInsights"; if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only"; if (systemdescription == null) systemdescription = "Description intentionally left blank."; LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); LOG.info(" - runtag: " + runtag); LOG.info(" - teamname: " + teamname); LOG.info(" - corpus_id: " + corpus_id); LOG.info(" - run description: " + systemdescription); Filter_run fr = new Filter_run.Factory().create(TEAMNAME, RUNTAG, systemdescription, corpus_id); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(PPR_HDFS, new Path(pprFile).toUri().toString()); conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString()); conf.set(RUNTAG, runtag); conf.set(TEAMNAME, teamname); FileSystem fs = FileSystem.get(conf); // Lookup required data from the topic file loadTopicData(queryfile, fr, fs, run_info); Job job = new Job(conf, "Toy KBA system"); job.setJarByClass(EntitySurfaceForms.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // job.setCombinerClass(MyReducer.class); // job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Let's go int status = job.waitForCompletion(true) ? 0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time_secs", ((double) cputime / 1000d)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); fr.setAdditionalProperties("run_info", run_info); System.out.println("#" + new Filter_run.Factory().toJSON(fr)); Text line = new Text(); LineReader reader = new LineReader(fs.open(new Path(out + "/part-r-00000"))); for (int i = 0; i < num_filter_results; i++) { reader.readLine(line); System.out.println(line.toString()); } System.out.println("#" + new Filter_run.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#")); return status; } }