List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
From source file:babel.prep.merge.PageMergeReducer.java
License:Apache License
/**
 * Folds every page received for {@code key} into one freshly created page and
 * emits that page under the same key.
 *
 * <p>The page counter in {@code PageMerger.Stats} is bumped once per call; the
 * merged-page counter is bumped only when more than one input page was folded.
 *
 * @param key      key shared by all incoming pages; its string form seeds the new page
 * @param pages    pages to merge
 * @param output   collector receiving the merged page
 * @param reporter unused by this method
 * @throws IOException if the collector fails to accept the merged page
 */
public void reduce(Text key, Iterator<Page> pages, OutputCollector<Text, Page> output, Reporter reporter)
        throws IOException {
    Page merged = new Page(key.toString());

    int inputCount = 0;
    for (; pages.hasNext(); inputCount++) {
        merged.merge(pages.next());
    }

    PageMerger.Stats.incPageCount();
    boolean actuallyMerged = inputCount > 1;
    if (actuallyMerged) {
        PageMerger.Stats.incMergedPageCount();
    }

    output.collect(key, merged);
}
From source file:base.Example.java
License:Open Source License
public Example(Text value) { fids = new int[0]; fvals = new float[0]; labels = new int[0]; docid = 0;//from w w w.ja va2 s.c o m parseString(value.toString()); }
From source file:bbuzz2011.stackoverflow.preprocess.xml.StackOverflowPostXMLMapper.java
License:Apache License
private void writePostBody(LongWritable key, Text value, Context context) throws SAXException, IOException, XPathExpressionException, InterruptedException { // TODO Where counters used? May be for some statistics? // Are counters global and atomic for all mappers? // Where do them output? context.getCounter(StackOverflowPostXMLMapper.Counter.TITLES).increment(1); Document doc = documentBuilder.parse(new InputSource(new StringReader(value.toString()))); // Retrieve title from xml post using xpath String title = (String) postTitleXPath.evaluate(doc, XPathConstants.STRING); if (title == null || title.equals("")) { context.getCounter(Counter.MISSING_TITLES).increment(1); return;//from w w w.ja v a2 s. com } String postHtml = (String) postBodyXPath.evaluate(doc, XPathConstants.STRING); String content = parser.parsePostContent(postHtml); // TODO Why not stackexchange post Id attribute? postKey.set((int) key.get()); postWritable.setTitle(title); postWritable.setContent(content); // Retrieve questions, not answers // TODO as improvement we can combine question and answers as single document for better clustering. if (isQuestion(doc)) { context.getCounter(Counter.QUESTIONS).increment(1); context.write(postKey, postWritable); } }
From source file:be.uantwerpen.adrem.bigfim.AprioriPhaseMapper.java
License:Apache License
/**
 * Converts one input line into a list of items and passes it to
 * {@code incrementSubSets}.
 *
 * @param key     record key; not used
 * @param value   one line of input
 * @param context job context; not used directly here
 * @throws IOException          declared by the mapper contract
 * @throws InterruptedException declared by the mapper contract
 */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    List<Integer> transaction = convertLineToSet(value.toString(), phase == 1, singletons, delimiter);
    incrementSubSets(transaction);
}
From source file:be.uantwerpen.adrem.bigfim.AprioriPhaseReducer.java
License:Apache License
@Override public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { Map<String, MutableInt> supports = getSupports(values); removeLowSupports(supports);//w w w .j a v a 2 s . co m if (supports.isEmpty()) { return; } String prefix = key.toString(); writeShortFis(prefix, supports); if (!supports.isEmpty()) { writeTrieGroup(prefix, supports); updatePGInfo(prefix, supports); } }
From source file:be.uantwerpen.adrem.bigfim.AprioriPhaseReducer.java
License:Apache License
/**
 * Accumulates supports per extension from values of the form
 * {@code "<extension> <support>"} (two fields separated by a single space).
 *
 * @param values encoded extension/support pairs
 * @return map from the first field to the running sum of the second field
 */
private Map<String, MutableInt> getSupports(Iterable<Text> values) {
    Map<String, MutableInt> totals = newHashMap();
    for (Text encoded : values) {
        String[] fields = encoded.toString().split(" ");
        // Look up the accumulator first, then parse and add the support.
        MutableInt accumulator = getFromMap(totals, fields[0]);
        accumulator.add(parseInt(fields[1]));
    }
    return totals;
}
From source file:be.uantwerpen.adrem.bigfim.ComputeTidListMapper.java
License:Apache License
@Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); List<Integer> items = convertLineToSet(line, phase == 1, singletons, delimiter); reportItemTids(context, items);// w w w. j a va 2s. c o m counter++; }
From source file:be.uantwerpen.adrem.bigfim.ComputeTidListReducer.java
License:Apache License
private void assignToBucket(Text key, Map<Integer, IntArrayWritable[]> map, int totalTids) throws IOException, InterruptedException { int lowestBucket = getLowestBucket(); if (!checkLowestBucket(lowestBucket, totalTids)) { bucketSizes.add(new MutableInt()); lowestBucket = bucketSizes.size() - 1; }//w ww . j a v a2s .c o m bucketSizes.get(lowestBucket).add(totalTids); lowestBucket += pgStartIndex; String baseOutputPath = basePGDir + "/bucket-" + lowestBucket; mos.write(IntArrayWritable.of(key.toString()), EmptyImw, baseOutputPath); for (Entry<Integer, IntArrayWritable[]> entry : map.entrySet()) { IntArrayWritable owKey = IntArrayWritable.of(entry.getKey()); IntMatrixWritable owValue = new IntMatrixWritable(entry.getValue()); mos.write(owKey, owValue, baseOutputPath); } mos.write(EmptyIaw, EmptyImw, baseOutputPath); }
From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java
License:Apache License
@Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] split = value.toString().split("\t"); String items = split[1];/*from w w w. j a v a 2 s . c o m*/ // if the prefix length is 1, just report the singletons, otherwise use // Eclat to find X-FIs seeds EclatMiner miner = new EclatMiner(); SetReporter reporter = new PrefixItemTIDsReporter(context, prefixLength, singletons, orderMap); miner.setSetReporter(reporter); miner.setMaxSize(prefixLength); for (String itemStr : items.split(" ")) { final int itemIx = orderMap.get(Integer.valueOf(itemStr)); final Item item = singletons.get(itemIx); assert (item.id == parseInt(itemStr)); List<Item> extensions = singletons.subList(itemIx + 1, singletons.size()); miner.mineRecByPruning(item, extensions, minSup); } }
From source file:be.uantwerpen.adrem.disteclat.PrefixComputerReducer.java
License:Apache License
private void assignToBucket(Text key, Map<Integer, IntArrayWritable[]> map, int totalTids) throws IOException, InterruptedException { int lowestBucket = getLowestBucket(); if (!checkLowestBucket(lowestBucket, totalTids)) { bucketSizes.add(new MutableInt()); lowestBucket = bucketSizes.size() - 1; }//ww w . ja v a 2s. c o m bucketSizes.get(lowestBucket).add(totalTids); String baseOutputPath = "bucket-" + lowestBucket; mos.write(IntArrayWritable.of(key.toString()), EmptyImw, baseOutputPath); for (Entry<Integer, IntArrayWritable[]> entry : map.entrySet()) { IntArrayWritable owKey = IntArrayWritable.of(entry.getKey()); IntMatrixWritable owValue = new IntMatrixWritable(entry.getValue()); mos.write(owKey, owValue, baseOutputPath); } mos.write(EmptyIaw, EmptyImw, baseOutputPath); }