List of usage examples for org.apache.hadoop.io.Text.toString()
@Override
public String toString()
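All of the project examples below follow the same basic pattern: the mapper (or loader) receives a Text object and calls toString() to decode its UTF-8 bytes into a Java String before splitting or parsing it. As a minimal, self-contained sketch of that pattern (the class and field names here are hypothetical, not taken from any of the listed projects), a simple mapper might look like this:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: splits each CSV line and emits the first field as the key.
public class FirstFieldMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Text outKey = new Text();
    private final Text outValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Text.toString() decodes the Text's UTF-8 bytes into a Java String.
        String line = value.toString();
        String[] fields = line.split(",");
        if (fields.length >= 2) {
            outKey.set(fields[0]);
            outValue.set(fields[1]);
            context.write(outKey, outValue);
        }
    }
}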
From source file:com.me.neu.popular_tag_year.Mapper1.java
@Override
public void map(LongWritable k1, Text v1, OutputCollector<Text, Text> output, Reporter rprtr)
        throws IOException {
    // Convert the Text line to a String and split it on commas;
    // the first field holds ">"-delimited tag names, the second field is the key to emit.
    String[] line = v1.toString().split(",");
    String[] tagNames = line[0].split(">");
    for (String tagName : tagNames) {
        output.collect(new Text(line[1]), new Text(tagName));
    }
}
From source file:com.me.neu.stackoverflow.Mapper1.java
@Override
public void map(LongWritable k1, Text v1, OutputCollector<Text, Text> output, Reporter rprtr)
        throws IOException {
    // Convert the Text line to a String and split it on commas;
    // here the second field holds ">"-delimited tag names and the first field is the key to emit.
    String[] line = v1.toString().split(",");
    String[] tagNames = line[1].split(">");
    for (String tagName : tagNames) {
        output.collect(new Text(line[0]), new Text(tagName));
    }
}
From source file:com.mh2c.WikipediaDumpLoaderMapper.java
License:Apache License
/**
 * key = article content
 * value = empty string
 */
@Override
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    try {
        // Parse the page of XML into a document
        Document doc = db.parse(new InputSource(new StringReader(key.toString())));
        // Extract the title and text (article content) from the page content
        String title = doc.getElementsByTagName("title").item(0).getTextContent();
        String text = doc.getElementsByTagName("text").item(0).getTextContent();
        // Emit the title and text pair
        output.collect(new Text(title), new Text(text));
        reporter.getCounter(Counter.ARTICLES).increment(1L);
    } catch (SAXException e) {
        throw new IOException(e);
    }
}
From source file:com.mh2c.WikipediaWordCountMapper.java
License:Apache License
/**
 * key = title
 * value = text
 */
@Override
public void map(Text key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    // Split the text content of the article on whitespace
    String[] words = value.toString().split("\\s+");
    // Count each word occurrence
    for (String word : words) {
        wordText.set(word);
        output.collect(wordText, ONE);
    }
}
From source file:com.missionsky.scp.dataanalysis.mahout.TestNaiveBayesDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("model", "m", "The path to the model built during training", true);
    addOption(buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
    addOption("labelIndex", "l", "The path to the location of the label index", true);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    boolean complementary = hasOption("testComplementary");
    boolean sequential = hasOption("runSequential");
    if (sequential) {
        FileSystem fs = FileSystem.get(getConf());
        NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
        AbstractNaiveBayesClassifier classifier;
        if (complementary) {
            classifier = new ComplementaryNaiveBayesClassifier(model);
        } else {
            classifier = new StandardNaiveBayesClassifier(model);
        }
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class,
                VectorWritable.class);
        Reader reader = new Reader(fs, getInputPath(), getConf());
        Text key = new Text();
        VectorWritable vw = new VectorWritable();
        while (reader.next(key, vw)) {
            // The SequenceFile key looks like "/label/docId"; keep only the label part.
            writer.append(new Text(SLASH.split(key.toString())[1]),
                    new VectorWritable(classifier.classifyFull(vw.get())));
        }
        writer.close();
        reader.close();
    } else {
        boolean succeeded = runMapReduce(parsedArgs);
        if (!succeeded) {
            return -1;
        }
    }

    // Load the labels
    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

    // Loop over the results and create the confusion matrix
    SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>(
            getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
    analyzeResults(labelMap, dirIterable, analyzer);

    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
    return 0;
}
From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    // Drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}
From source file:com.ML_Hadoop.K_meansClustering.K_meansClusteringMap.java
// @Override is necessary here; without it the default map() would run instead.
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    // Read one line of the input file and extract each feature value
    String value_of_a_line_of_a_input_file = value.toString();
    String[] split_data_of_the_line = value_of_a_line_of_a_input_file.split("\t"); // fields are TAB-separated
    Float[] values_of_data_of_the_line = new Float[feature_size];
    for (int i = 0; i < feature_size; i++)
        values_of_data_of_the_line[i] = Float.parseFloat(split_data_of_the_line[i]);
    similarityCheck(values_of_data_of_the_line);
}
From source file:com.ML_Hadoop.MultipleLinearRegression.MultipleLinearRegressionMap.java
// @Override is necessary here; without it the default map() would run instead.
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    String[] features = line.split(","); // split the line into its comma-separated fields (features and target)
    ArrayList<Float> values = new ArrayList<Float>();
    for (int i = 0; i < features.length; i++)
        values.add(Float.parseFloat(features[i]));
    prediction_error.add(compute_J(values));
}
From source file:com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMap_Continuous_Features.java
// @Override is necessary here; without it the default map() would run instead.
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    String[] features_str = line.split(","); // split the line into separate strings
    class_id = Integer.parseInt(features_str[0]);
    features = new Float[features_str.length - 1];
    for (int i = 0; i < features.length; i++) {
        features[i] = Float.parseFloat(features_str[i + 1]);
    }
    ArrayList<Float[]> t;
    if (features_probabilities.get(class_id) == null) {
        t = new ArrayList<Float[]>();
    } else {
        t = features_probabilities.get(class_id);
    }
    t.add(features);
    features_probabilities.put(class_id, t);
    num_of_members_in_each_class[class_id]++;
}
From source file:com.mortardata.pig.JsonLoader.java
License:Apache License
@Override
public Tuple getNext() throws IOException {
    Text val = null;
    try {
        if (!reader.nextKeyValue())
            return null;
        val = (Text) reader.getCurrentValue();
    } catch (Exception e) {
        throw new IOException(e);
    }

    // Create a parser specific for this input line.
    // This may not be the most efficient approach.
    // Text.getBytes() may return a backing array longer than the valid data, so limit it to getLength().
    ByteArrayInputStream bais = new ByteArrayInputStream(val.getBytes(), 0, val.getLength());
    JsonParser p = jsonFactory.createJsonParser(bais);

    Tuple t;

    // schema provided
    if (!useDefaultSchema) {
        // Create a map of field names to ResourceFieldSchema's,
        // and create a map of field names to positions in the tuple.
        // These are used during parsing to handle extra, missing, and/or out-of-order
        // fields properly.
        Map<String, ResourceFieldSchema> schemaMap = new HashMap<String, ResourceFieldSchema>();
        Map<String, Integer> schemaPositionMap = new HashMap<String, Integer>();

        if (requiredFields != null) {
            int count = 0;
            for (int i = 0; i < fields.length; i++) {
                if (requiredFields[i]) {
                    schemaMap.put(fields[i].getName(), fields[i]);
                    schemaPositionMap.put(fields[i].getName(), count);
                    count++;
                }
            }
            t = tupleFactory.newTuple(count);
        } else {
            for (int i = 0; i < fields.length; i++) {
                schemaMap.put(fields[i].getName(), fields[i]);
                schemaPositionMap.put(fields[i].getName(), i);
            }
            t = tupleFactory.newTuple(fields.length);
        }

        try {
            p.nextToken(); // move to start of object
            parseObjectIntoTuple(val.toString(), p, schemaMap, schemaPositionMap, t);
        } catch (JsonParseException jpe) {
            // If the line doesn't parse as a valid JSON object, log an error and move on
            log.error("Error parsing record: " + val + ": " + jpe.toString());
        }
    } else {
        // schema not provided: load the whole document as a map
        t = tupleFactory.newTuple(1);
        try {
            p.nextToken(); // move to start of object
            t.set(0, readField(val.toString(), p, schema.getFields()[0]));
        } catch (JsonParseException jpe) {
            log.error("Error parsing record: " + val + ": " + jpe.toString());
        }
    }

    p.close();
    return t;
}