Example usage for org.apache.hadoop.io Text toString

Introduction

On this page you can find usage examples for the toString() method of org.apache.hadoop.io.Text.

Prototype

@Override
public String toString() 

Document

Convert text back to string
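For orientation, here is a minimal, self-contained sketch (not taken from the sources below; the class name TextToStringDemo is ours) of the String/Text round trip:

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        // Text stores its contents as UTF-8 bytes.
        Text text = new Text("hello, hadoop");
        // toString() decodes those bytes back into a java.lang.String.
        String decoded = text.toString();
        System.out.println(decoded); // prints "hello, hadoop"
    }
}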

Usage

From source file:com.moz.fiji.mapreduce.lib.bulkimport.DescribedInputTextBulkImporter.java

License:Apache License

/**
 * Post-processes rejected lines (logging, keeping count, etc.).
 *
 * @param line the line that was rejected by the producer.
 * @param context the context in which the rejection occurred.
 * @param reason the reason why this line was rejected.
 */
public void reject(Text line, FijiTableContext context, String reason) {
    if (mRejectedLineCounter % mLogRate == 0L) {
        LOG.error("Rejecting line: {} with reason: {}", line.toString(), reason);
    }
    mRejectedLineCounter++;

    //TODO(FIJIMRLIB-9) Abort this bulk importer job early if rejected records exceed a threshold
    context.incrementCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_REJECTED);

    //TODO(FIJIMRLIB-4) Allow this to emit to a rejected output so that import can be reattempted.
}

From source file:com.moz.fiji.mapreduce.lib.bulkimport.JSONBulkImporter.java

License:Apache License

/** {@inheritDoc} */
@Override
public void produce(Text value, FijiTableContext context) throws IOException {
    JsonObject gson = new JsonParser().parse(value.toString()).getAsJsonObject();

    for (FijiColumnName fijiColumnName : getDestinationColumns()) {
        String entityIdSource = getFromPath(gson, getEntityIdSource());
        if (entityIdSource == null) {
            LOG.error("Unable to retrieve entityId from source field: " + getEntityIdSource());
            return;
        }
        final EntityId eid = context.getEntityId(entityIdSource);
        String source = getSource(fijiColumnName);
        String fieldValue = getFromPath(gson, source);
        if (fieldValue != null) {
            String family = fijiColumnName.getFamily();
            String qualifier = fijiColumnName.getQualifier();
            if (isOverrideTimestamp()) {
                // Override the timestamp from the imported source
                String timestampSource = getFromPath(gson, getTimestampSource());
                Long timestamp = Long.parseLong(timestampSource);
                context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
            } else {
                // Use the system time as the timestamp
                context.put(eid, family, qualifier, convert(fijiColumnName, fieldValue));
            }
        } else {
            incomplete(value, context, "Detected missing field: " + source);
        }
    }
}

From source file:com.moz.fiji.mapreduce.lib.bulkimport.XMLBulkImporter.java

License:Apache License

/** {@inheritDoc} */
@Override
public void produce(Text xmlText, FijiTableContext context) throws IOException {
    String xml = xmlText.toString();
    // Prepare the document builder and XPath.
    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder documentBuilder = null;
    Document document = null;
    XPathFactory xPathFactory = XPathFactory.newInstance();
    XPath xPath = xPathFactory.newXPath();
    try {
        // Parse the record as an XML document
        documentBuilder = documentBuilderFactory.newDocumentBuilder();
        InputSource inputSource = new InputSource();
        inputSource.setCharacterStream(new StringReader(xml));
        try {
            document = documentBuilder.parse(inputSource);
        } catch (SAXException saxe) {
            reject(xmlText, context, "Failed to parse XML.");
            return;
        }
    } catch (ParserConfigurationException pce) {
        // Should be unreachable, since the default configuration is used.
        reject(xmlText, context, "Invalid parser configuration.");
        return;
    }

    // Get the entityId.
    String entityIdStr = null;
    try {
        entityIdStr = xPath.compile(getEntityIdSource()).evaluate(document);
        if (entityIdStr == null || entityIdStr.isEmpty()) {
            reject(xmlText, context, "Unable to retrieve entityId from source field.");
            return;
        }
    } catch (XPathExpressionException xpee) {
        // Should be unreachable, errors caught in setupImporter().
        LOG.error("Invalid XPath expression: " + getEntityIdSource());
        throw new RuntimeException("Invalid XPath expression: " + getEntityIdSource());
    }

    final EntityId eid = context.getEntityId(entityIdStr);

    // Get the timestamp.
    Long timestamp = null;
    if (isOverrideTimestamp()) {
        String timestampSource = null;
        try {
            timestampSource = xPath.compile(getTimestampSource()).evaluate(document);
        } catch (XPathExpressionException xpee) {
            // Should be unreachable, errors caught in setupImporter().
            LOG.error("Invalid XPath expression: " + getTimestampSource());
            throw new RuntimeException("Invalid XPath expression: " + getTimestampSource());
        }
        try {
            timestamp = Long.parseLong(timestampSource);
        } catch (NumberFormatException nfe) {
            incomplete(xmlText, context, "Detected missing field: " + getTimestampSource());
        }
    } else {
        // If timestamp is not overridden in the import descriptor, use the current system time for
        // all writes to this row.
        timestamp = System.currentTimeMillis();
    }

    // For each output column, traverse the XML document with XPath and write the data.
    for (FijiColumnName fijiColumnName : getDestinationColumns()) {

        String source = getSource(fijiColumnName);
        String fieldValue = null;
        try {
            fieldValue = xPath.compile(source).evaluate(document);
        } catch (XPathExpressionException xpee) {
            // Should be unreachable, errors caught in setupImporter().
            LOG.error("Invalid XPath expression: " + source);
            throw new RuntimeException("Invalid XPath expression: " + source);
        }
        if (fieldValue != null && !fieldValue.isEmpty()) {
            String family = fijiColumnName.getFamily();
            String qualifier = fijiColumnName.getQualifier();
            context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
        } else {
            incomplete(xmlText, context, "Detected missing field: " + source);
        }
    }
}

From source file:com.moz.fiji.mapreduce.lib.examples.News20BulkImporter.java

License:Apache License

/**
 * Reads a single news article and writes its contents to a new Fiji row,
 * indexed by the article's name (a string consisting of the parent folder
 * and this article's hash) and the a priori categorization of this article.
 *
 * @param key The fully qualified path to the current file we're reading.
 * @param value The raw data to insert into this column.
 * @param context The context to write to.
 * @throws IOException if there is an error.
 */
@Override
public void produce(Text key, Text value, FijiTableContext context) throws IOException {
    Path qualifiedPath = new Path(key.toString());

    // Category is specified on the containing folder.
    String category = qualifiedPath.getParent().getName();
    // Name is the concatenation of category and file name.
    String name = category + "." + qualifiedPath.getName();

    // write name, category, and raw article.
    EntityId entity = context.getEntityId(name);
    context.put(entity, FAMILY, ARTICLE_NAME_QUALIFIER, name);
    context.put(entity, FAMILY, CATEGORY_QUALIFIER, category);
    context.put(entity, FAMILY, RAW_ARTICLE_QUALIFIER, value.toString());
}

From source file:com.moz.fiji.mapreduce.lib.examples.TextListReducer.java

License:Apache License

/** {@inheritDoc} */
@Override
public void reduce(K key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    // Construct a list to hold all of the values in the group.
    List<CharSequence> aggregateList = new ArrayList<CharSequence>();

    // Add each input value to our aggregate list.
    for (Text value : values) {
        aggregateList.add(value.toString());
    }

    // Put the aggregate list into the wrapper AvroValue to be written.
    mValue.datum(aggregateList);

    // Write the output key and its aggregate list of values.
    context.write(key, mValue);
}
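The excerpt uses a reusable wrapper field mValue that is declared elsewhere in the class. A minimal declaration consistent with this usage, assuming Avro's mapred wrapper (the generic type is our inference from the aggregate list), might be:

import java.util.List;
import org.apache.avro.mapred.AvroValue;

// Reused across reduce() calls to avoid per-call allocation (type inferred, not from the source).
private final AvroValue<List<CharSequence>> mValue = new AvroValue<List<CharSequence>>(null);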

From source file:com.moz.fiji.mapreduce.lib.map.TextFlatteningMapper.java

License:Apache License

/**
 * Converts the bytes stored in fileContents to a single String with all
 * newline characters removed.
 *
 * @param fileName The qualified path to this file.
 * @param fileContents The file to convert, encoded in UTF-8.
 * @param context The Context to write to.
 * @throws IOException if there is an error.
 * @throws InterruptedException if there is an error.
 */
@Override
protected void map(Text fileName, Text fileContents, Context context) throws IOException, InterruptedException {
    // Run over file and remove each newline character.
    // These files are expected to be small (and already fit in a Text object)
    // so we should be able to toString() them.
    String text = fileContents.toString();

    // Replace all newlines with spaces.
    String withoutNewlines = text.replaceAll("\n", " ");
    mFlattenedFile.set(withoutNewlines);

    context.write(fileName, mFlattenedFile);
}

From source file:com.moz.fiji.mapreduce.testlib.SimpleBulkImporter.java

License:Apache License

/** {@inheritDoc} */
@Override
public void produce(LongWritable filePos, Text value, FijiTableContext context) throws IOException {
    final String line = value.toString();
    final String[] split = line.split(":");
    Preconditions.checkState(split.length == 2,
            String.format("Unable to parse bulk-import test input line: '%s'.", line));
    final String rowKey = split[0];
    final int integerValue = Integer.parseInt(split[1]);

    final EntityId eid = context.getEntityId(rowKey);
    context.put(eid, "primitives", "int", integerValue);
    context.put(eid, "primitives", "long", filePos.get());
    context.put(eid, "primitives", "string", String.format("%s-%d", rowKey, integerValue));
}
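Per the parsing logic above, each input line must have the form rowKey:integerValue. A (hypothetical) line "row1:42" would produce a row keyed by "row1" containing the int 42, the long file position, and the string "row1-42" in the primitives family.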

From source file:com.moz.fiji.schema.filter.TestRegexQualifierColumnFilter.java

License:Apache License

@Test
public void testRegexQualifierColumnFilter() throws Exception {
    final File outputDir = File.createTempFile("gatherer-output", ".dir", getLocalTempDir());
    Preconditions.checkState(outputDir.delete());
    final int numSplits = 1;

    // Run a gatherer over the test_table.
    final FijiMapReduceJob gatherJob = FijiGatherJobBuilder.create().withConf(getConf())
            .withInputTable(mTable.getURI()).withGatherer(MyGatherer.class).withOutput(MapReduceJobOutputs
                    .newSequenceFileMapReduceJobOutput(new Path(outputDir.getPath()), numSplits))
            .build();
    assertTrue(gatherJob.run());

    // Check the output file: two things should be there (apple, aardvark).
    final SequenceFile.Reader reader = FijiMRPlatformBridge.get().newSeqFileReader(getConf(),
            new Path(outputDir.getPath(), "part-m-00000"));
    try {
        final Text key = new Text();
        assertTrue(reader.next(key));
        assertEquals("aardvark", key.toString());
        assertTrue(reader.next(key));
        assertEquals("apple", key.toString());
        assertFalse(reader.next(key));
    } finally {
        reader.close();
    }
}

From source file:com.mozilla.grouperfish.mahout.clustering.display.lda.OriginalText.java

License:Apache License

public static Map<Integer, PriorityQueue<Pair<Double, String>>> getDocIds(Path docTopicsPath, int numDocs) {
    Map<Integer, PriorityQueue<Pair<Double, String>>> docIdMap = new HashMap<Integer, PriorityQueue<Pair<Double, String>>>();
    Map<Integer, Double> maxDocScores = new HashMap<Integer, Double>();
    SequenceFileDirectoryReader pointsReader = null;
    try {
        Text k = new Text();
        VectorWritable vw = new VectorWritable();
        pointsReader = new SequenceFileDirectoryReader(docTopicsPath);
        while (pointsReader.next(k, vw)) {
            String docId = k.toString();
            Vector normGamma = vw.get();
            Iterator<Element> iter = normGamma.iterateNonZero();
            double maxTopicScore = 0.0;
            int idx = 0;
            int topic = 0;
            while (iter.hasNext()) {
                Element e = iter.next();
                double score = e.get();
                if (score > maxTopicScore) {
                    maxTopicScore = score;
                    topic = idx;
                }

                idx++;
            }

            PriorityQueue<Pair<Double, String>> docIdsForTopic = docIdMap.get(topic);
            if (docIdsForTopic == null) {
                docIdsForTopic = new PriorityQueue<Pair<Double, String>>(numDocs);
            }

            Double maxDocScoreForTopic = maxDocScores.get(topic);
            if (maxDocScoreForTopic == null) {
                maxDocScoreForTopic = 0.0;
            }
            if (maxTopicScore > maxDocScoreForTopic) {
                maxDocScores.put(topic, maxTopicScore);
            }

            enqueue(docIdsForTopic, docId, maxTopicScore, numDocs);
            docIdMap.put(topic, docIdsForTopic);
        }
    } catch (IOException e) {
        LOG.error("IOException caught while reading clustered points", e);
    } finally {
        if (pointsReader != null) {
            pointsReader.close();
        }
    }

    for (Map.Entry<Integer, Double> entry : maxDocScores.entrySet()) {
        System.out.println("For topic: " + entry.getKey() + " max score: " + entry.getValue());
    }

    return docIdMap;
}