List of usage examples for org.apache.hadoop.io.Text.toString()
@Override
public String toString()
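Before the collected examples, a minimal sketch of the behavior they all rely on: Text stores a string as UTF-8 bytes, and toString() decodes those bytes back into a Java String. The demo class below is illustrative and not taken from any of the source files that follow.

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        // Text holds UTF-8 encoded bytes; toString() decodes them back to a String.
        Text text = new Text("café");
        String decoded = text.toString();
        System.out.println(decoded);          // café
        System.out.println(decoded.length()); // 4 characters
        System.out.println(text.getLength()); // 5 bytes: 'é' is two bytes in UTF-8
    }
}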
From source file: com.moz.fiji.mapreduce.lib.bulkimport.DescribedInputTextBulkImporter.java
License: Apache License
/**
 * Post-processes rejected lines (logging, keeping count, etc.).
 *
 * @param line the line that was rejected by the producer.
 * @param context the context in which the rejection occurred.
 * @param reason the reason why this line was rejected.
 */
public void reject(Text line, FijiTableContext context, String reason) {
    if (mRejectedLineCounter % mLogRate == 0L) {
        LOG.error("Rejecting line: {} with reason: {}", line.toString(), reason);
    }
    mRejectedLineCounter++;
    // TODO(FIJIMRLIB-9) Abort this bulk importer job early if rejected records exceed a threshold.
    context.incrementCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_REJECTED);
    // TODO(FIJIMRLIB-4) Allow this to emit to a rejected output so that import can be reattempted.
}
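Note the rate-limited logging: only every mLogRate-th rejection is written to the log, so a badly formatted input file cannot flood it, while the BULKIMPORTER_RECORDS_REJECTED counter still records every rejection.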
From source file: com.moz.fiji.mapreduce.lib.bulkimport.JSONBulkImporter.java
License: Apache License
/** {@inheritDoc} */
@Override
public void produce(Text value, FijiTableContext context) throws IOException {
    JsonObject gson = new JsonParser().parse(value.toString()).getAsJsonObject();
    for (FijiColumnName fijiColumnName : getDestinationColumns()) {
        String entityIdSource = getFromPath(gson, getEntityIdSource());
        if (entityIdSource == null) {
            LOG.error("Unable to retrieve entityId from source field: " + getEntityIdSource());
            return;
        }
        final EntityId eid = context.getEntityId(entityIdSource);

        String source = getSource(fijiColumnName);
        String fieldValue = getFromPath(gson, source);
        if (fieldValue != null) {
            String family = fijiColumnName.getFamily();
            String qualifier = fijiColumnName.getQualifier();
            if (isOverrideTimestamp()) {
                // Override the timestamp from the imported source.
                String timestampSource = getFromPath(gson, getTimestampSource());
                Long timestamp = Long.parseLong(timestampSource);
                context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
            } else {
                // Use the system time as the timestamp.
                context.put(eid, family, qualifier, convert(fijiColumnName, fieldValue));
            }
        } else {
            incomplete(value, context, "Detected missing field: " + source);
        }
    }
}
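For orientation, an input line shaped like the following (field names are hypothetical, not from the source) would be handled by the loop above, with getEntityIdSource() returning a path such as "user/id" and each destination column mapped to a source path like "user/name":

{"user": {"id": "alice", "name": "Alice", "ts": 1361561664000}}

A missing source field routes the record to incomplete() rather than failing the job.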
From source file: com.moz.fiji.mapreduce.lib.bulkimport.XMLBulkImporter.java
License: Apache License
/** {@inheritDoc} */
@Override
public void produce(Text xmlText, FijiTableContext context) throws IOException {
    String xml = xmlText.toString();

    // Prepare the document builder and XPath.
    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder documentBuilder = null;
    Document document = null;
    XPathFactory xPathFactory = XPathFactory.newInstance();
    XPath xPath = xPathFactory.newXPath();
    try {
        // Parse the record as an XML document.
        documentBuilder = documentBuilderFactory.newDocumentBuilder();
        InputSource inputSource = new InputSource();
        inputSource.setCharacterStream(new StringReader(xml));
        try {
            document = documentBuilder.parse(inputSource);
        } catch (SAXException saxe) {
            reject(xmlText, context, "Failed to parse XML.");
            return;
        }
    } catch (ParserConfigurationException pce) {
        // Should be unreachable, since the default configuration is used.
        reject(xmlText, context, "Invalid parser configuration.");
        return;
    }

    // Get the entityId.
    String entityIdStr = null;
    try {
        entityIdStr = xPath.compile(getEntityIdSource()).evaluate(document);
        if (entityIdStr == null || entityIdStr.isEmpty()) {
            reject(xmlText, context, "Unable to retrieve entityId from source field.");
            return;
        }
    } catch (XPathExpressionException xpee) {
        // Should be unreachable, errors caught in setupImporter().
        LOG.error("Invalid XPath expression: " + getEntityIdSource());
        throw new RuntimeException("Invalid XPath expression: " + getEntityIdSource());
    }
    final EntityId eid = context.getEntityId(entityIdStr);

    // Get the timestamp.
    Long timestamp = null;
    if (isOverrideTimestamp()) {
        String timestampSource = null;
        try {
            timestampSource = xPath.compile(getTimestampSource()).evaluate(document);
        } catch (XPathExpressionException xpee) {
            // Should be unreachable, errors caught in setupImporter().
            LOG.error("Invalid XPath expression: " + getTimestampSource());
            throw new RuntimeException("Invalid XPath expression: " + getTimestampSource());
        }
        try {
            timestamp = Long.parseLong(timestampSource);
        } catch (NumberFormatException nfe) {
            incomplete(xmlText, context, "Detected missing field: " + getTimestampSource());
        }
    } else {
        // If the timestamp is not overridden in the import descriptor, use the current system
        // time for all writes to this row.
        timestamp = System.currentTimeMillis();
    }

    // For each output column, traverse the XML document with XPath and write the data.
    for (FijiColumnName fijiColumnName : getDestinationColumns()) {
        String source = getSource(fijiColumnName);
        String fieldValue = null;
        try {
            fieldValue = xPath.compile(source).evaluate(document);
        } catch (XPathExpressionException xpee) {
            // Should be unreachable, errors caught in setupImporter().
            LOG.error("Invalid XPath expression: " + source);
            throw new RuntimeException("Invalid XPath expression: " + source);
        }
        if (fieldValue != null && !fieldValue.isEmpty()) {
            String family = fijiColumnName.getFamily();
            String qualifier = fijiColumnName.getQualifier();
            context.put(eid, family, qualifier, timestamp, convert(fijiColumnName, fieldValue));
        } else {
            incomplete(xmlText, context, "Detected missing field: " + source);
        }
    }
}
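By analogy with the JSON importer, a record such as the following (element names are hypothetical) would be imported with an entity-ID XPath of "/user/id" and per-column XPath expressions like "/user/name":

<user>
  <id>alice</id>
  <name>Alice</name>
  <ts>1361561664000</ts>
</user>

Unlike a hard parse failure, which calls reject() and stops processing the record, a missing or empty column value only marks the record incomplete() and the remaining columns are still written.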
From source file: com.moz.fiji.mapreduce.lib.examples.News20BulkImporter.java
License: Apache License
/**
 * Reads a single news article and writes its contents to a new Fiji row,
 * indexed by the article's name (a string consisting of the parent folder and
 * this article's hash) and the a priori categorization of this article.
 *
 * @param key The fully qualified path to the current file we're reading.
 * @param value The raw data to insert into this column.
 * @param context The context to write to.
 * @throws IOException if there is an error.
 */
@Override
public void produce(Text key, Text value, FijiTableContext context) throws IOException {
    Path qualifiedPath = new Path(key.toString());

    // Category is specified on the containing folder.
    String category = qualifiedPath.getParent().getName();

    // Name is the concatenation of category and file name.
    String name = category + "." + qualifiedPath.getName();

    // Write name, category, and raw article.
    EntityId entity = context.getEntityId(name);
    context.put(entity, FAMILY, ARTICLE_NAME_QUALIFIER, name);
    context.put(entity, FAMILY, CATEGORY_QUALIFIER, category);
    context.put(entity, FAMILY, RAW_ARTICLE_QUALIFIER, value.toString());
}
From source file: com.moz.fiji.mapreduce.lib.examples.TextListReducer.java
License: Apache License
/** {@inheritDoc} */
@Override
public void reduce(K key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    // Construct a list to hold all of the values in the group.
    List<CharSequence> aggregateList = new ArrayList<CharSequence>();

    // Add each input value to our aggregate list.
    for (Text value : values) {
        aggregateList.add(value.toString());
    }

    // Put the aggregate list into the wrapper AvroValue to be written.
    mValue.datum(aggregateList);

    // Write the output key and its aggregate list of values.
    context.write(key, mValue);
}
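A small design point worth noting: mValue is a reusable AvroValue wrapper held as a field of the reducer, so each reduce() call swaps in a fresh list via datum() instead of allocating a new writable per group.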
From source file: com.moz.fiji.mapreduce.lib.map.TextFlatteningMapper.java
License: Apache License
/**
 * Converts the bytes stored in fileContents to a single String with all newline
 * characters removed.
 *
 * @param fileName The qualified path to this file.
 * @param fileContents The file to convert, encoded in UTF-8.
 * @param context The Context to write to.
 * @throws IOException if there is an error.
 * @throws InterruptedException if there is an error.
 */
@Override
protected void map(Text fileName, Text fileContents, Context context) throws IOException, InterruptedException {
    // Run over the file and remove each newline character.
    // These files are expected to be small (and already fit in a Text object),
    // so we should be able to toString() them.
    String text = fileContents.toString();

    // Replace all newlines with spaces.
    String withoutNewlines = text.replaceAll("\n", " ");
    mFlattenedFile.set(withoutNewlines);
    context.write(fileName, mFlattenedFile);
}
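One caveat: replaceAll("\n", " ") leaves the '\r' of Windows-style line endings in place. If carriage returns can occur in the input, a variant such as text.replaceAll("\\r?\\n", " ") (a suggestion, not part of the original mapper) would normalize both styles.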
From source file: com.moz.fiji.mapreduce.testlib.SimpleBulkImporter.java
License: Apache License
/** {@inheritDoc} */
@Override
public void produce(LongWritable filePos, Text value, FijiTableContext context) throws IOException {
    final String line = value.toString();
    final String[] split = line.split(":");
    Preconditions.checkState(split.length == 2,
        String.format("Unable to parse bulk-import test input line: '%s'.", line));
    final String rowKey = split[0];
    final int integerValue = Integer.parseInt(split[1]);

    final EntityId eid = context.getEntityId(rowKey);
    context.put(eid, "primitives", "int", integerValue);
    context.put(eid, "primitives", "long", filePos.get());
    context.put(eid, "primitives", "string", String.format("%s-%d", rowKey, integerValue));
}
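Given the split on ':', an input line such as "row1:42" read at file offset 0 would produce three puts on the row "row1": the int 42, the long 0 (the file position), and the string "row1-42".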
From source file: com.moz.fiji.schema.filter.TestRegexQualifierColumnFilter.java
License: Apache License
@Test
public void testRegexQualifierColumnFilter() throws Exception {
    final File outputDir = File.createTempFile("gatherer-output", ".dir", getLocalTempDir());
    Preconditions.checkState(outputDir.delete());
    final int numSplits = 1;

    // Run a gatherer over the test_table.
    final FijiMapReduceJob gatherJob = FijiGatherJobBuilder.create()
        .withConf(getConf())
        .withInputTable(mTable.getURI())
        .withGatherer(MyGatherer.class)
        .withOutput(MapReduceJobOutputs.newSequenceFileMapReduceJobOutput(
            new Path(outputDir.getPath()), numSplits))
        .build();
    assertTrue(gatherJob.run());

    // Check the output file: two things should be there (apple, aardvark).
    final SequenceFile.Reader reader = FijiMRPlatformBridge.get()
        .newSeqFileReader(getConf(), new Path(outputDir.getPath(), "part-m-00000"));
    try {
        final Text key = new Text();
        assertTrue(reader.next(key));
        assertEquals("aardvark", key.toString());
        assertTrue(reader.next(key));
        assertEquals("apple", key.toString());
        assertFalse(reader.next(key));
    } finally {
        reader.close();
    }
}
From source file: com.mozilla.grouperfish.mahout.clustering.display.lda.OriginalText.java
License: Apache License
public static Map<Integer, PriorityQueue<Pair<Double, String>>> getDocIds(Path docTopicsPath, int numDocs) {
    Map<Integer, PriorityQueue<Pair<Double, String>>> docIdMap =
        new HashMap<Integer, PriorityQueue<Pair<Double, String>>>();
    Map<Integer, Double> maxDocScores = new HashMap<Integer, Double>();
    SequenceFileDirectoryReader pointsReader = null;
    try {
        Text k = new Text();
        VectorWritable vw = new VectorWritable();
        pointsReader = new SequenceFileDirectoryReader(docTopicsPath);
        while (pointsReader.next(k, vw)) {
            String docId = k.toString();
            Vector normGamma = vw.get();
            Iterator<Element> iter = normGamma.iterateNonZero();

            // Find the topic with the highest score for this document.
            double maxTopicScore = 0.0;
            int idx = 0;
            int topic = 0;
            while (iter.hasNext()) {
                Element e = iter.next();
                double score = e.get();
                if (score > maxTopicScore) {
                    maxTopicScore = score;
                    topic = idx;
                }
                idx++;
            }

            PriorityQueue<Pair<Double, String>> docIdsForTopic = docIdMap.get(topic);
            if (docIdsForTopic == null) {
                docIdsForTopic = new PriorityQueue<Pair<Double, String>>(numDocs);
            }

            Double maxDocScoreForTopic = maxDocScores.get(topic);
            if (maxDocScoreForTopic == null) {
                maxDocScoreForTopic = 0.0;
            }
            if (maxTopicScore > maxDocScoreForTopic) {
                maxDocScores.put(topic, maxTopicScore);
            }

            enqueue(docIdsForTopic, docId, maxTopicScore, numDocs);
            docIdMap.put(topic, docIdsForTopic);
        }
    } catch (IOException e) {
        LOG.error("IOException caught while reading clustered points", e);
    } finally {
        if (pointsReader != null) {
            pointsReader.close();
        }
    }

    for (Map.Entry<Integer, Double> entry : maxDocScores.entrySet()) {
        System.out.println("For topic: " + entry.getKey() + " max score: " + entry.getValue());
    }
    return docIdMap;
}
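Each document is assigned to its dominant topic, and the per-topic PriorityQueue (presumably bounded to numDocs entries by the enqueue() helper, whose implementation is not shown here) keeps the highest-scoring documents, so the returned map is a top-N index of document IDs per topic.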