List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
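Text stores its contents as UTF-8 encoded bytes, and toString() decodes those bytes back into a java.lang.String. Before the real-world examples below, here is a minimal, self-contained sketch of the typical round trip; the class name and sample strings are illustrative only, not taken from the projects that follow.

import org.apache.hadoop.io.Text;

public class TextToStringExample {

  public static void main(String[] args) {
    // Text holds UTF-8 bytes; the constructor encodes the given String.
    Text text = new Text("hello\tworld");

    // toString() decodes the UTF-8 bytes back into a String.
    String decoded = text.toString();

    // A common pattern in the examples below: decode once, then parse.
    String[] columns = decoded.split("\t");
    System.out.println(columns[0] + " / " + columns[1]);

    // Text objects are mutable and are often reused between records.
    text.set("another record");
    System.out.println(text.toString());
  }
}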
From source file:com.datasalt.pangool.tuplemr.serialization.TestTupleFieldSerialization.java
License:Apache License
@SuppressWarnings("deprecation") @Test/*from w w w.j av a 2 s .c o m*/ public void test() throws Exception { initHadoop(); trash(INPUT1, INPUT2, OUTPUT); // Prepare input BufferedWriter writer; // INPUT1 writer = new BufferedWriter(new FileWriter(INPUT1)); writer.write("foo1" + "\t" + "30" + "\n"); writer.write("foo2" + "\t" + "20" + "\n"); writer.write("foo3" + "\t" + "140" + "\n"); writer.write("foo4" + "\t" + "110" + "\n"); writer.write("foo5" + "\t" + "220" + "\n"); writer.write("foo6" + "\t" + "260" + "\n"); writer.close(); // INPUT2 writer = new BufferedWriter(new FileWriter(INPUT2)); writer.write("4.5" + "\t" + "true" + "\n"); writer.write("4.6" + "\t" + "false" + "\n"); writer.close(); TupleMRBuilder builder = new TupleMRBuilder(getConf()); final Schema tupleSchema1 = new Schema("tupleSchema1", Fields.parse("a:string, b:int")); final Schema tupleSchema2 = new Schema("tupleSchema2", Fields.parse("c:double, d:boolean")); List<Field> fields = new ArrayList<Field>(); fields.add(Field.create("partitionId", Type.INT)); fields.add(Fields.createTupleField("tuple1", tupleSchema1)); final Schema schema1 = new Schema("tupleInTuple1", fields); fields.clear(); fields.add(Field.create("partitionId", Type.INT)); fields.add(Fields.createTupleField("tuple2", tupleSchema2)); final Schema schema2 = new Schema("tupleInTuple2", fields); builder.addIntermediateSchema(schema1); builder.addIntermediateSchema(schema2); builder.addInput(new Path(INPUT1), new HadoopInputFormat(TextInputFormat.class), new TupleMapper<LongWritable, Text>() { ITuple tupleInTuple1 = new Tuple(schema1); ITuple tuple1 = new Tuple(tupleSchema1); @Override public void map(LongWritable key, Text value, TupleMRContext context, Collector collector) throws IOException, InterruptedException { String[] split = value.toString().split("\t"); tuple1.set("a", split[0]); tuple1.set("b", Integer.parseInt(split[1])); tupleInTuple1.set("partitionId", 0); tupleInTuple1.set("tuple1", tuple1); collector.write(tupleInTuple1); } }); builder.addInput(new Path(INPUT2), new HadoopInputFormat(TextInputFormat.class), new TupleMapper<LongWritable, Text>() { ITuple tupleInTuple2 = new Tuple(schema2); ITuple tuple2 = new Tuple(tupleSchema2); @Override public void map(LongWritable key, Text value, TupleMRContext context, Collector collector) throws IOException, InterruptedException { String[] split = value.toString().split("\t"); tuple2.set("c", Double.parseDouble(split[0])); tuple2.set("d", Boolean.parseBoolean(split[1])); tupleInTuple2.set("partitionId", 0); tupleInTuple2.set("tuple2", tuple2); collector.write(tupleInTuple2); } }); builder.setTupleReducer(new TupleReducer<Text, NullWritable>() { public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector) throws IOException, InterruptedException, TupleMRException { Iterator<ITuple> iterator = tuples.iterator(); ITuple currentTuple; assertEquals(0, group.get("partitionId")); currentTuple = iterator.next(); assertEquals("foo1", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(30, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo2", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(20, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo3", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(140, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo4", ((ITuple) 
currentTuple.get("tuple1")).get("a").toString()); assertEquals(110, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo5", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(220, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo6", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(260, ((ITuple) currentTuple.get("tuple1")).get("b")); // Second data source BEGINS currentTuple = iterator.next(); assertEquals(4.5, ((ITuple) currentTuple.get("tuple2")).get("c")); assertEquals(true, ((ITuple) currentTuple.get("tuple2")).get("d")); currentTuple = iterator.next(); assertEquals(4.6, ((ITuple) currentTuple.get("tuple2")).get("c")); assertEquals(false, ((ITuple) currentTuple.get("tuple2")).get("d")); }; }); builder.setGroupByFields("partitionId"); builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class); Job job = builder.createJob(); try { job.waitForCompletion(true); } finally { builder.cleanUpInstanceFiles(); } trash(INPUT1, INPUT2, OUTPUT); }
From source file:com.datasalt.utils.mapred.crossproduct.TestCrossProductMapRed.java
License:Apache License
public void test(boolean twoSteps) throws Exception {
  createFirstDataSet();
  createSecondDataSet();
  Configuration conf = getConf();
  FileSystem fS = FileSystem.get(conf);

  if (twoSteps) {
    /*
     * Here we are saying that 1-list elements is enough to collapse the JVM's heap - just for testing
     */
    conf.setInt(CrossProductMapRed.SPLIT_DATASET_SIZE_CONF, 1);
  }

  CrossProductMapRed crossProduct = new CrossProductMapRed("Test", conf);
  crossProduct.setLeftInputPath(new Path(INPUT_1));
  crossProduct.setLeftInputFormat(TextInputFormat.class);
  crossProduct.setLeftInputMapper(Map.class);

  crossProduct.setRightInputPath(new Path(INPUT_2));
  crossProduct.setRightInputFormat(TextInputFormat.class);
  crossProduct.setRightInputMapper(Map.class);

  crossProduct.setOutputPath(new Path(OUTPUT));
  crossProduct.setOutputFormat(SequenceFileOutputFormat.class);

  crossProduct.memoryAwareRun();

  SequenceFile.Reader reader;
  CrossProductExtraKey groupKey = new CrossProductExtraKey();
  CrossProductPair data = new CrossProductPair();
  Text txt = new Text();
  Text txt2 = new Text();

  if (twoSteps) {
    reader = new SequenceFile.Reader(fS, new Path(OUTPUT, "EXTRA" + "/" + "part-r-00000"), conf);
    /*
     * Assert intermediate "big groups" output
     */
    for (int i = 0; i < 9; i++) {
      reader.next(groupKey);
      reader.getCurrentValue(data);
      if (i < 3) {
        ser.deser(txt, data.getRight());
        switch (i) {
        case 0:
          assertEquals(txt.toString(), "pere");
          break;
        case 1:
          assertEquals(txt.toString(), "eric");
          break;
        case 2:
          assertEquals(txt.toString(), "ivan");
          break;
        }
      } else {
        ser.deser(txt, data.getLeft());
        switch (i) {
        case 3:
          assertEquals(txt.toString(), "beer");
          break;
        case 4:
          assertEquals(txt.toString(), "beer");
          break;
        case 5:
          assertEquals(txt.toString(), "beer");
          break;
        case 6:
          assertEquals(txt.toString(), "wine");
          break;
        case 7:
          assertEquals(txt.toString(), "wine");
          break;
        case 8:
          assertEquals(txt.toString(), "wine");
          break;
        }
      }
    }
    reader.close();
  }

  /*
   * Assert final output
   */
  Counter count = Counter.createWithDistinctElements();
  Path finalOutput = new Path(OUTPUT, "part-r-00000");
  if (twoSteps) {
    finalOutput = new Path(crossProduct.getBigGroupsOutput(), "part-r-00000");
  }
  reader = new SequenceFile.Reader(fS, finalOutput, conf);
  for (int i = 0; i < 6; i++) {
    reader.next(data);
    ser.deser(txt, data.getLeft());
    ser.deser(txt2, data.getRight());
    count.in(txt.toString()).count(txt2.toString());
  }

  Count counts = count.getCounts();
  List<String> beerResults = counts.get("beer").getDistinctListAsStringList();
  List<String> wineResults = counts.get("wine").getDistinctListAsStringList();
  for (List<String> list : new List[] { beerResults, wineResults }) {
    assertEquals(list.contains("pere"), true);
    assertEquals(list.contains("ivan"), true);
    assertEquals(list.contains("eric"), true);
  }

  HadoopUtils.deleteIfExists(fS, new Path(INPUT_1));
  HadoopUtils.deleteIfExists(fS, new Path(INPUT_2));
  HadoopUtils.deleteIfExists(fS, new Path(OUTPUT));
  if (twoSteps) {
    HadoopUtils.deleteIfExists(fS, crossProduct.getBigGroupsOutput());
  }
}
From source file:com.datascience.cascading.scheme.CsvScheme.java
License:Apache License
@Override @SuppressWarnings("unchecked") public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) throws TapException, IOException { Object[] context = sourceCall.getContext(); if (!sourceCall.getInput().next(context[0], context[1])) { return false; }/* w w w . ja va2 s. c om*/ TupleEntry entry = sourceCall.getIncomingEntry(); ListWritable<Text> values = (ListWritable<Text>) context[1]; Fields fields = getSourceFields(); if ((format.getHeader() != null) && !areFieldsInFormatHeaders(fields)) { try { LongWritable pos = (LongWritable) context[0]; Long position = pos.get(); String message = String.format("%s: %s", "Failed to parse record. fields not in header record at position: ", position); LOGGER.warn(message); if (strict) { throw new CsvParseException(message); } else { return true; } } catch (CsvParseException e) { throw new TapException(e); } } int check = strict ? fields.size() : values.size() != fields.size() ? values.size() : fields.size(); int checkDiff = check - fields.size(); for (int i = 0; i < (checkDiff < 1 ? fields.size() : values.size()); i++) { int index = indices != null && checkDiff < 1 ? indices.get(fields.get(i).toString()) : i; //fill empty values with null for records missing values Text value = values.size() - i < 1 ? null : values.get(index); if (value == null) { entry.setString(i, null); } else { try { entry.setString(i, value.toString()); } catch (Exception e) { if (!strict) { Tuple tuple = new Tuple(); for (Text val : values) { tuple.addString(val.toString()); } throw new TapException(e.getMessage(), e, tuple); } else { return false; } } } } return true; }
From source file:com.davidgildeh.hadoop.input.simpledb.SimpleDBRecordReader.java
License:Apache License
/**
 * Get next Key/Value Record (Tuple) from the Split
 *
 * @param key The key to set
 * @param value The HashMap value to set
 * @return True - next Item available, False - No more items available
 * @throws IOException
 */
public boolean next(Text key, MapWritable value) throws IOException {
  // Get next item off the ArrayList unless we're at the end
  if (cursor < split.getLength()) {
    Item item = items.get(cursor++);
    key.set(item.getName());
    for (Attribute attribute : item.getAttributes()) {
      value.put(new Text(attribute.getName()), new Text(attribute.getValue()));
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Sending next record to Mappers: " + key.toString());
    }
    return true;
  } else {
    return false;
  }
}
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(Path input, File dir, int[] count) throws IOException {
  Configuration conf = BehemothConfiguration.create();
  Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
  for (Reader current : cacheReaders) {
    // read the key + values in that file
    Text key = new Text();
    BehemothDocument inputDoc = new BehemothDocument();
    BufferedWriter writer = null;
    gate.Document gatedocument = null;
    while (current.next(key, inputDoc)) {
      count[0]++;
      // generate a GATE document then save it to XML
      try {
        // first put the text
        GATEProcessor gp = new GATEProcessor(new URL("http://dummy.com"));
        gp.setConfig(conf);
        gatedocument = gp.generateGATEDoc(inputDoc);
        // then save as XML
        File outputFile = new File(dir, count[0] + ".xml");
        if (outputFile.exists() == false)
          outputFile.createNewFile();
        writer = new BufferedWriter(new FileWriter(outputFile));
        writer.write(gatedocument.toXml());
      } catch (Exception e) {
        LOG.error("Exception on doc [" + count[0] + "] " + key.toString(), e);
      } finally {
        if (writer != null)
          writer.close();
        if (gatedocument != null)
          Factory.deleteResource(gatedocument);
      }
    }
    current.close();
  }
}
From source file:com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob.java
License:Apache License
public void map(Text key, Content content, OutputCollector<Text, BehemothDocument> output, Reporter reporter)
    throws IOException {

  BehemothDocument behemothDocument = new BehemothDocument();

  int status = Integer.parseInt(content.getMetadata().get(Nutch.FETCH_STATUS_KEY));
  if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
    // content not fetched successfully, skip document
    LOG.debug("Skipping " + key + " as content is not fetched successfully");
    return;
  }

  // TODO store the fetch metadata in the Behemoth document
  // store the binary content and mimetype in the Behemoth document
  String contentType = content.getContentType();
  byte[] binarycontent = content.getContent();
  behemothDocument.setUrl(key.toString());
  behemothDocument.setContent(binarycontent);
  behemothDocument.setContentType(contentType);
  output.collect(key, behemothDocument);
}
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context)
    throws IOException, InterruptedException {
  String sContent = value.getText();
  if (sContent == null) {
    // no text available? skip
    context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
    return;
  }
  TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent.toString()));
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  StringTuple document = new StringTuple();
  stream.reset();
  while (stream.incrementToken()) {
    if (termAtt.length() > 0) {
      document.add(new String(termAtt.buffer(), 0, termAtt.length()));
    }
  }
  context.write(key, document);
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int run(String[] args) throws Exception {

  Options options = new Options();
  // automatically generate the help statement
  HelpFormatter formatter = new HelpFormatter();
  // create the parser
  CommandLineParser parser = new GnuParser();

  options.addOption("h", "help", false, "print this message");
  options.addOption("v", "vector", true, "input vector sequencefile");
  options.addOption("l", "label", true, "input vector sequencefile");
  options.addOption("o", "output", true, "output Behemoth corpus");

  // parse the command line arguments
  CommandLine line = null;
  try {
    line = parser.parse(options, args);
    if (line.hasOption("help")) {
      formatter.printHelp("CorpusGenerator", options);
      return 0;
    }
    if (!line.hasOption("v") | !line.hasOption("o") | !line.hasOption("l")) {
      formatter.printHelp("CorpusGenerator", options);
      return -1;
    }
  } catch (ParseException e) {
    formatter.printHelp("CorpusGenerator", options);
  }

  Path vectorPath = new Path(line.getOptionValue("v"));
  Path labelPath = new Path(line.getOptionValue("l"));
  String output = line.getOptionValue("o");

  Path tempOutput = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

  // extracts the string representations from the vectors
  int retVal = vectorToString(vectorPath, tempOutput);
  if (retVal != 0) {
    HadoopUtil.delete(getConf(), tempOutput);
    return retVal;
  }

  Path tempOutput2 = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

  retVal = convert(tempOutput, labelPath, tempOutput2);

  // delete the temp output
  HadoopUtil.delete(getConf(), tempOutput);

  if (retVal != 0) {
    HadoopUtil.delete(getConf(), tempOutput2);
    return retVal;
  }

  // convert tempOutput to standard file
  BufferedWriter bow = new BufferedWriter(new FileWriter(new File(output)));

  // the label dictionary is not dumped to text
  int labelMaxIndex = 0;
  Map<String, Integer> labelIndex = new HashMap<String, Integer>();

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  FileStatus[] fss = fs.listStatus(tempOutput2);
  try {
    for (FileStatus status : fss) {
      Path path = status.getPath();
      // skips the _log or _SUCCESS files
      if (!path.getName().startsWith("part-") && !path.getName().equals(tempOutput2.getName()))
        continue;
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

      // read the key + values in that file
      Text key = new Text();
      Text value = new Text();

      while (reader.next(key, value)) {
        String label = key.toString();
        // replace the label by its index
        Integer indexLabel = labelIndex.get(label);
        if (indexLabel == null) {
          indexLabel = new Integer(labelMaxIndex);
          labelIndex.put(label, indexLabel);
          labelMaxIndex++;
        }
        String val = value.toString();
        bow.append(indexLabel.toString()).append(val).append("\n");
      }
      reader.close();
    }
    bow.flush();
  } catch (Exception e) {
    e.printStackTrace();
    return -1;
  } finally {
    bow.close();
    fs.delete(tempOutput2, true);
  }
  return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  // should have two values : a vector and a label
  String label = null;
  String attributes = null;
  while (values.hasNext()) {
    String t = values.next().toString();
    if (t.startsWith("VECTOR_")) {
      attributes = t.substring(7);
    } else
      label = t;
  }
  if (label == null) {
    log.info(key.toString() + " does not have label");
  } else if (attributes == null) {
    log.info(key.toString() + " does not have attributes");
  } else {
    output.collect(new Text(label), new Text(attributes));
  }
}
From source file:com.digitalpebble.behemoth.uima.UIMAMapper.java
License:Apache License
public void map(Text id, BehemothDocument behemoth, OutputCollector<Text, BehemothDocument> output,
    Reporter reporter) throws IOException {

  reporter.setStatus("UIMA : " + id.toString());

  // generate a CAS from the input document
  cas.reset();

  try {
    // does the input document have a some text?
    // if not - skip it
    if (behemoth.getText() == null) {
      LOG.debug(behemoth.getUrl().toString() + " has null text");
    } else {
      // detect language if specified by user
      String lang = this.config.get("uima.language", "en");
      cas.setDocumentLanguage(lang);
      cas.setDocumentText(behemoth.getText());
      // process it
      tae.process(cas);
      convertCASToBehemoth(cas, behemoth, reporter);
    }
  } catch (Exception e) {
    reporter.incrCounter("UIMA", "Exception", 1);
    throw new IOException(e);
  }

  reporter.incrCounter("UIMA", "Document", 1);

  // dump the modified document
  output.collect(id, behemoth);
}