List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
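Text stores its contents as UTF-8 encoded bytes, and toString() decodes those bytes back into a java.lang.String. Before the real-world examples below, here is a minimal, self-contained sketch of the typical round trip; the class name and sample strings are illustrative only, not taken from the projects that follow.

import org.apache.hadoop.io.Text;

public class TextToStringExample {

  public static void main(String[] args) {
    // Text holds UTF-8 bytes; the constructor encodes the given String.
    Text text = new Text("hello\tworld");

    // toString() decodes the UTF-8 bytes back into a String.
    String decoded = text.toString();

    // A common pattern in the examples below: decode once, then parse.
    String[] columns = decoded.split("\t");
    System.out.println(columns[0] + " / " + columns[1]);

    // Text objects are mutable and are often reused between records.
    text.set("another record");
    System.out.println(text.toString());
  }
}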
From source file:com.datasalt.pangool.tuplemr.serialization.TestTupleFieldSerialization.java
License:Apache License
@SuppressWarnings("deprecation") @Test/*from w w w.j av a 2 s .c o m*/ public void test() throws Exception { initHadoop(); trash(INPUT1, INPUT2, OUTPUT); // Prepare input BufferedWriter writer; // INPUT1 writer = new BufferedWriter(new FileWriter(INPUT1)); writer.write("foo1" + "\t" + "30" + "\n"); writer.write("foo2" + "\t" + "20" + "\n"); writer.write("foo3" + "\t" + "140" + "\n"); writer.write("foo4" + "\t" + "110" + "\n"); writer.write("foo5" + "\t" + "220" + "\n"); writer.write("foo6" + "\t" + "260" + "\n"); writer.close(); // INPUT2 writer = new BufferedWriter(new FileWriter(INPUT2)); writer.write("4.5" + "\t" + "true" + "\n"); writer.write("4.6" + "\t" + "false" + "\n"); writer.close(); TupleMRBuilder builder = new TupleMRBuilder(getConf()); final Schema tupleSchema1 = new Schema("tupleSchema1", Fields.parse("a:string, b:int")); final Schema tupleSchema2 = new Schema("tupleSchema2", Fields.parse("c:double, d:boolean")); List<Field> fields = new ArrayList<Field>(); fields.add(Field.create("partitionId", Type.INT)); fields.add(Fields.createTupleField("tuple1", tupleSchema1)); final Schema schema1 = new Schema("tupleInTuple1", fields); fields.clear(); fields.add(Field.create("partitionId", Type.INT)); fields.add(Fields.createTupleField("tuple2", tupleSchema2)); final Schema schema2 = new Schema("tupleInTuple2", fields); builder.addIntermediateSchema(schema1); builder.addIntermediateSchema(schema2); builder.addInput(new Path(INPUT1), new HadoopInputFormat(TextInputFormat.class), new TupleMapper<LongWritable, Text>() { ITuple tupleInTuple1 = new Tuple(schema1); ITuple tuple1 = new Tuple(tupleSchema1); @Override public void map(LongWritable key, Text value, TupleMRContext context, Collector collector) throws IOException, InterruptedException { String[] split = value.toString().split("\t"); tuple1.set("a", split[0]); tuple1.set("b", Integer.parseInt(split[1])); tupleInTuple1.set("partitionId", 0); tupleInTuple1.set("tuple1", tuple1); collector.write(tupleInTuple1); } }); builder.addInput(new Path(INPUT2), new HadoopInputFormat(TextInputFormat.class), new TupleMapper<LongWritable, Text>() { ITuple tupleInTuple2 = new Tuple(schema2); ITuple tuple2 = new Tuple(tupleSchema2); @Override public void map(LongWritable key, Text value, TupleMRContext context, Collector collector) throws IOException, InterruptedException { String[] split = value.toString().split("\t"); tuple2.set("c", Double.parseDouble(split[0])); tuple2.set("d", Boolean.parseBoolean(split[1])); tupleInTuple2.set("partitionId", 0); tupleInTuple2.set("tuple2", tuple2); collector.write(tupleInTuple2); } }); builder.setTupleReducer(new TupleReducer<Text, NullWritable>() { public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector) throws IOException, InterruptedException, TupleMRException { Iterator<ITuple> iterator = tuples.iterator(); ITuple currentTuple; assertEquals(0, group.get("partitionId")); currentTuple = iterator.next(); assertEquals("foo1", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(30, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo2", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(20, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo3", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(140, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo4", ((ITuple) 
currentTuple.get("tuple1")).get("a").toString()); assertEquals(110, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo5", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(220, ((ITuple) currentTuple.get("tuple1")).get("b")); currentTuple = iterator.next(); assertEquals("foo6", ((ITuple) currentTuple.get("tuple1")).get("a").toString()); assertEquals(260, ((ITuple) currentTuple.get("tuple1")).get("b")); // Second data source BEGINS currentTuple = iterator.next(); assertEquals(4.5, ((ITuple) currentTuple.get("tuple2")).get("c")); assertEquals(true, ((ITuple) currentTuple.get("tuple2")).get("d")); currentTuple = iterator.next(); assertEquals(4.6, ((ITuple) currentTuple.get("tuple2")).get("c")); assertEquals(false, ((ITuple) currentTuple.get("tuple2")).get("d")); }; }); builder.setGroupByFields("partitionId"); builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class); Job job = builder.createJob(); try { job.waitForCompletion(true); } finally { builder.cleanUpInstanceFiles(); } trash(INPUT1, INPUT2, OUTPUT); }
From source file:com.datasalt.utils.mapred.crossproduct.TestCrossProductMapRed.java
License:Apache License
public void test(boolean twoSteps) throws Exception {
  createFirstDataSet();
  createSecondDataSet();
  Configuration conf = getConf();
  FileSystem fS = FileSystem.get(conf);

  if (twoSteps) {
    /*
     * Here we are saying that 1-list elements is enough to collapse the JVM's heap - just for testing
     */
    conf.setInt(CrossProductMapRed.SPLIT_DATASET_SIZE_CONF, 1);
  }

  CrossProductMapRed crossProduct = new CrossProductMapRed("Test", conf);
  crossProduct.setLeftInputPath(new Path(INPUT_1));
  crossProduct.setLeftInputFormat(TextInputFormat.class);
  crossProduct.setLeftInputMapper(Map.class);

  crossProduct.setRightInputPath(new Path(INPUT_2));
  crossProduct.setRightInputFormat(TextInputFormat.class);
  crossProduct.setRightInputMapper(Map.class);

  crossProduct.setOutputPath(new Path(OUTPUT));
  crossProduct.setOutputFormat(SequenceFileOutputFormat.class);

  crossProduct.memoryAwareRun();

  SequenceFile.Reader reader;
  CrossProductExtraKey groupKey = new CrossProductExtraKey();
  CrossProductPair data = new CrossProductPair();
  Text txt = new Text();
  Text txt2 = new Text();

  if (twoSteps) {
    reader = new SequenceFile.Reader(fS, new Path(OUTPUT, "EXTRA" + "/" + "part-r-00000"), conf);
    /*
     * Assert intermediate "big groups" output
     */
    for (int i = 0; i < 9; i++) {
      reader.next(groupKey);
      reader.getCurrentValue(data);
      if (i < 3) {
        ser.deser(txt, data.getRight());
        switch (i) {
        case 0:
          assertEquals(txt.toString(), "pere");
          break;
        case 1:
          assertEquals(txt.toString(), "eric");
          break;
        case 2:
          assertEquals(txt.toString(), "ivan");
          break;
        }
      } else {
        ser.deser(txt, data.getLeft());
        switch (i) {
        case 3:
          assertEquals(txt.toString(), "beer");
          break;
        case 4:
          assertEquals(txt.toString(), "beer");
          break;
        case 5:
          assertEquals(txt.toString(), "beer");
          break;
        case 6:
          assertEquals(txt.toString(), "wine");
          break;
        case 7:
          assertEquals(txt.toString(), "wine");
          break;
        case 8:
          assertEquals(txt.toString(), "wine");
          break;
        }
      }
    }
    reader.close();
  }

  /*
   * Assert final output
   */
  Counter count = Counter.createWithDistinctElements();
  Path finalOutput = new Path(OUTPUT, "part-r-00000");
  if (twoSteps) {
    finalOutput = new Path(crossProduct.getBigGroupsOutput(), "part-r-00000");
  }
  reader = new SequenceFile.Reader(fS, finalOutput, conf);
  for (int i = 0; i < 6; i++) {
    reader.next(data);
    ser.deser(txt, data.getLeft());
    ser.deser(txt2, data.getRight());
    count.in(txt.toString()).count(txt2.toString());
  }

  Count counts = count.getCounts();
  List<String> beerResults = counts.get("beer").getDistinctListAsStringList();
  List<String> wineResults = counts.get("wine").getDistinctListAsStringList();
  for (List<String> list : new List[] { beerResults, wineResults }) {
    assertEquals(list.contains("pere"), true);
    assertEquals(list.contains("ivan"), true);
    assertEquals(list.contains("eric"), true);
  }

  HadoopUtils.deleteIfExists(fS, new Path(INPUT_1));
  HadoopUtils.deleteIfExists(fS, new Path(INPUT_2));
  HadoopUtils.deleteIfExists(fS, new Path(OUTPUT));
  if (twoSteps) {
    HadoopUtils.deleteIfExists(fS, crossProduct.getBigGroupsOutput());
  }
}
From source file:com.datascience.cascading.scheme.CsvScheme.java
License:Apache License
@Override @SuppressWarnings("unchecked") public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) throws TapException, IOException { Object[] context = sourceCall.getContext(); if (!sourceCall.getInput().next(context[0], context[1])) { return false; }/* w w w . ja va2 s. c om*/ TupleEntry entry = sourceCall.getIncomingEntry(); ListWritable<Text> values = (ListWritable<Text>) context[1]; Fields fields = getSourceFields(); if ((format.getHeader() != null) && !areFieldsInFormatHeaders(fields)) { try { LongWritable pos = (LongWritable) context[0]; Long position = pos.get(); String message = String.format("%s: %s", "Failed to parse record. fields not in header record at position: ", position); LOGGER.warn(message); if (strict) { throw new CsvParseException(message); } else { return true; } } catch (CsvParseException e) { throw new TapException(e); } } int check = strict ? fields.size() : values.size() != fields.size() ? values.size() : fields.size(); int checkDiff = check - fields.size(); for (int i = 0; i < (checkDiff < 1 ? fields.size() : values.size()); i++) { int index = indices != null && checkDiff < 1 ? indices.get(fields.get(i).toString()) : i; //fill empty values with null for records missing values Text value = values.size() - i < 1 ? null : values.get(index); if (value == null) { entry.setString(i, null); } else { try { entry.setString(i, value.toString()); } catch (Exception e) { if (!strict) { Tuple tuple = new Tuple(); for (Text val : values) { tuple.addString(val.toString()); } throw new TapException(e.getMessage(), e, tuple); } else { return false; } } } } return true; }
From source file:com.davidgildeh.hadoop.input.simpledb.SimpleDBRecordReader.java
License:Apache License
/**
 * Get next Key/Value Record (Tuple) from the Split
 *
 * @param key The key to set
 * @param value The HashMap value to set
 * @return True - next Item available, False - No more items available
 * @throws IOException
 */
public boolean next(Text key, MapWritable value) throws IOException {
  // Get next item off the ArrayList unless we're at the end
  if (cursor < split.getLength()) {
    Item item = items.get(cursor++);
    key.set(item.getName());
    for (Attribute attribute : item.getAttributes()) {
      value.put(new Text(attribute.getName()), new Text(attribute.getValue()));
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Sending next record to Mappers: " + key.toString());
    }
    return true;
  } else {
    return false;
  }
}
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(Path input, File dir, int[] count) throws IOException {
  Configuration conf = BehemothConfiguration.create();
  Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input);
  for (Reader current : cacheReaders) {
    // read the key + values in that file
    Text key = new Text();
    BehemothDocument inputDoc = new BehemothDocument();
    BufferedWriter writer = null;
    gate.Document gatedocument = null;
    while (current.next(key, inputDoc)) {
      count[0]++;
      // generate a GATE document then save it to XML
      try {
        // first put the text
        GATEProcessor gp = new GATEProcessor(new URL("http://dummy.com"));
        gp.setConfig(conf);
        gatedocument = gp.generateGATEDoc(inputDoc);
        // then save as XML
        File outputFile = new File(dir, count[0] + ".xml");
        if (outputFile.exists() == false)
          outputFile.createNewFile();
        writer = new BufferedWriter(new FileWriter(outputFile));
        writer.write(gatedocument.toXml());
      } catch (Exception e) {
        LOG.error("Exception on doc [" + count[0] + "] " + key.toString(), e);
      } finally {
        if (writer != null)
          writer.close();
        if (gatedocument != null)
          Factory.deleteResource(gatedocument);
      }
    }
    current.close();
  }
}
From source file:com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob.java
License:Apache License
public void map(Text key, Content content, OutputCollector<Text, BehemothDocument> output, Reporter reporter)
    throws IOException {

  BehemothDocument behemothDocument = new BehemothDocument();

  int status = Integer.parseInt(content.getMetadata().get(Nutch.FETCH_STATUS_KEY));
  if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
    // content not fetched successfully, skip document
    LOG.debug("Skipping " + key + " as content is not fetched successfully");
    return;
  }

  // TODO store the fetch metadata in the Behemoth document
  // store the binary content and mimetype in the Behemoth document
  String contentType = content.getContentType();
  byte[] binarycontent = content.getContent();
  behemothDocument.setUrl(key.toString());
  behemothDocument.setContent(binarycontent);
  behemothDocument.setContentType(contentType);
  output.collect(key, behemothDocument);
}
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context)
    throws IOException, InterruptedException {
  String sContent = value.getText();
  if (sContent == null) {
    // no text available? skip
    context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
    return;
  }
  TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent.toString()));
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  StringTuple document = new StringTuple();
  stream.reset();
  while (stream.incrementToken()) {
    if (termAtt.length() > 0) {
      document.add(new String(termAtt.buffer(), 0, termAtt.length()));
    }
  }
  context.write(key, document);
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int run(String[] args) throws Exception {

  Options options = new Options();
  // automatically generate the help statement
  HelpFormatter formatter = new HelpFormatter();
  // create the parser
  CommandLineParser parser = new GnuParser();

  options.addOption("h", "help", false, "print this message");
  options.addOption("v", "vector", true, "input vector sequencefile");
  options.addOption("l", "label", true, "input vector sequencefile");
  options.addOption("o", "output", true, "output Behemoth corpus");

  // parse the command line arguments
  CommandLine line = null;
  try {
    line = parser.parse(options, args);
    if (line.hasOption("help")) {
      formatter.printHelp("CorpusGenerator", options);
      return 0;
    }
    if (!line.hasOption("v") | !line.hasOption("o") | !line.hasOption("l")) {
      formatter.printHelp("CorpusGenerator", options);
      return -1;
    }
  } catch (ParseException e) {
    formatter.printHelp("CorpusGenerator", options);
  }

  Path vectorPath = new Path(line.getOptionValue("v"));
  Path labelPath = new Path(line.getOptionValue("l"));
  String output = line.getOptionValue("o");

  Path tempOutput = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

  // extracts the string representations from the vectors
  int retVal = vectorToString(vectorPath, tempOutput);
  if (retVal != 0) {
    HadoopUtil.delete(getConf(), tempOutput);
    return retVal;
  }

  Path tempOutput2 = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

  retVal = convert(tempOutput, labelPath, tempOutput2);

  // delete the temp output
  HadoopUtil.delete(getConf(), tempOutput);

  if (retVal != 0) {
    HadoopUtil.delete(getConf(), tempOutput2);
    return retVal;
  }

  // convert tempOutput to standard file
  BufferedWriter bow = new BufferedWriter(new FileWriter(new File(output)));

  // the label dictionary is not dumped to text
  int labelMaxIndex = 0;
  Map<String, Integer> labelIndex = new HashMap<String, Integer>();

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  FileStatus[] fss = fs.listStatus(tempOutput2);
  try {
    for (FileStatus status : fss) {
      Path path = status.getPath();
      // skips the _log or _SUCCESS files
      if (!path.getName().startsWith("part-") && !path.getName().equals(tempOutput2.getName()))
        continue;
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

      // read the key + values in that file
      Text key = new Text();
      Text value = new Text();

      while (reader.next(key, value)) {
        String label = key.toString();
        // replace the label by its index
        Integer indexLabel = labelIndex.get(label);
        if (indexLabel == null) {
          indexLabel = new Integer(labelMaxIndex);
          labelIndex.put(label, indexLabel);
          labelMaxIndex++;
        }
        String val = value.toString();
        bow.append(indexLabel.toString()).append(val).append("\n");
      }
      reader.close();
    }
    bow.flush();
  } catch (Exception e) {
    e.printStackTrace();
    return -1;
  } finally {
    bow.close();
    fs.delete(tempOutput2, true);
  }
  return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  // should have two values : a vector and a label
  String label = null;
  String attributes = null;
  while (values.hasNext()) {
    String t = values.next().toString();
    if (t.startsWith("VECTOR_")) {
      attributes = t.substring(7);
    } else
      label = t;
  }
  if (label == null) {
    log.info(key.toString() + " does not have label");
  } else if (attributes == null) {
    log.info(key.toString() + " does not have attributes");
  } else {
    output.collect(new Text(label), new Text(attributes));
  }
}
From source file:com.digitalpebble.behemoth.uima.UIMAMapper.java
License:Apache License
public void map(Text id, BehemothDocument behemoth, OutputCollector<Text, BehemothDocument> output,
    Reporter reporter) throws IOException {

  reporter.setStatus("UIMA : " + id.toString());

  // generate a CAS from the input document
  cas.reset();

  try {
    // does the input document have a some text?
    // if not - skip it
    if (behemoth.getText() == null) {
      LOG.debug(behemoth.getUrl().toString() + " has null text");
    } else {
      // detect language if specified by user
      String lang = this.config.get("uima.language", "en");
      cas.setDocumentLanguage(lang);
      cas.setDocumentText(behemoth.getText());
      // process it
      tae.process(cas);
      convertCASToBehemoth(cas, behemoth, reporter);
    }
  } catch (Exception e) {
    reporter.incrCounter("UIMA", "Exception", 1);
    throw new IOException(e);
  }

  reporter.incrCounter("UIMA", "Document", 1);

  // dump the modified document
  output.collect(id, behemoth);
}