List of usage examples for org.apache.hadoop.io.Text.toString()
@Override
public String toString()
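Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the source files below) illustrating the basic behavior: Text stores its contents as UTF-8 bytes, and toString() decodes those bytes into a java.lang.String. The class name TextToStringExample is only illustrative.

import org.apache.hadoop.io.Text;

public class TextToStringExample {
    public static void main(String[] args) {
        // Text holds UTF-8 bytes; toString() decodes them into a String.
        Text text = new Text("nginx access log line");
        String decoded = text.toString();
        System.out.println(decoded.length() + ": " + decoded);
    }
}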
From source file:com.littlehotspot.hadoop.mr.nginx.module.hdfs2hbase.api.user.UserMapper.java
License:Open Source License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    try {
        String rowLineContent = value.toString();
        // System.out.println(rowLineContent);
        Matcher matcher = CommonVariables.MAPPER_INPUT_FORMAT_REGEX.matcher(rowLineContent);
        if (!matcher.find()) {
            return;
        }
        String deviceId = matcher.group(16);
        if (StringUtils.isBlank(deviceId)) {
            return;
        }
        Text keyText = new Text(deviceId);
        System.out.println(rowLineContent);
        context.write(keyText, value);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.littlehotspot.hadoop.mr.nginx.module.hdfs2hbase.api.user.UserReducer.java
License:Open Source License
@Override
protected void reduce(Text key, Iterable<Text> value, Reducer<Text, Text, Text, Text>.Context context)
        throws IOException, InterruptedException {
    try {
        Iterator<Text> textIterator = value.iterator();
        TextTargetUserBean targetUserBean = new TextTargetUserBean();
        while (textIterator.hasNext()) {
            Text item = textIterator.next();
            if (item == null) {
                continue;
            }
            String rowLineContent = item.toString();
            TextSourceUserBean sourceUserBean = new TextSourceUserBean(rowLineContent);
            targetUserBean.setDeviceId(sourceUserBean.getDeviceId());
            targetUserBean.setDeviceType(sourceUserBean.getDeviceType());
            targetUserBean.setMachineModel(sourceUserBean.getMachineModel());
            targetUserBean.setSince(sourceUserBean.getChannelId());
            // targetUserBean.setToken();
            // targetUserBean.setDemandTime();
            // targetUserBean.setProjectionTime();
            this.setDownloadTime(sourceUserBean, targetUserBean);
        }
        CommonVariables.hBaseHelper.insert(targetUserBean);
        context.write(new Text(targetUserBean.rowLine()), new Text());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.lovelysystems.hive.udf.MemcachedUDF.java
License:Apache License
private MemcachedClient getClient(Text servers) throws IOException {
    if (!clients.containsKey(servers)) {
        MemcachedClient client = new MemcachedClient(defaultConnFactory,
                AddrUtil.getAddresses(servers.toString()));
        clients.put(servers, client);
        return client;
    } else {
        return clients.get(servers);
    }
}
From source file:com.lovelysystems.hive.udf.MemcachedUDF.java
License:Apache License
public IntWritable evaluate(Text servers, Text key, Text value) {
    if (key == null || servers == null) {
        result.set(-1);
        return result;
    }
    MemcachedClient client;
    try {
        client = getClient(servers);
    } catch (IOException e) {
        LOG.error("failed to get client: servers=" + servers + " key=" + key + " value=" + value, e);
        result.set(-1);
        return result;
    }
    Future<Boolean> f;
    if (value == null) {
        f = client.delete(key.toString());
        result.set(2);
    } else {
        f = client.set(key.toString(), 0, value.toString());
        result.set(1);
    }
    try {
        f.get();
    } catch (Exception e) {
        LOG.error("failed to set value: servers=" + servers + " key=" + key + " value=" + value, e);
        result.set(-2);
    }
    return result;
}
From source file:com.lovelysystems.hive.udf.UnescapeXMLUDF.java
License:Apache License
public Text evaluate(final Text s) {
    if (s == null) {
        return null;
    } else if (s.find("&") == -1) {
        res.set(s);
    } else {
        res.set(s.toString());
    }
    return res;
}
From source file:com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(addOption(DefaultOptionCreator.overwriteOption().create()));
    addOption("model", "m", "The path to the model built during training", true);
    addOption(buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
    addOption("labelIndex", "l", "The path to the location of the label index", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
    }
    boolean complementary = hasOption("testComplementary");
    boolean sequential = hasOption("runSequential");
    if (sequential) {
        FileSystem fs = FileSystem.get(getConf());
        NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
        AbstractNaiveBayesClassifier classifier;
        if (complementary) {
            classifier = new ComplementaryNaiveBayesClassifier(model);
        } else {
            classifier = new StandardNaiveBayesClassifier(model);
        }
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class,
                VectorWritable.class);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, getInputPath(), getConf());
        Text key = new Text();
        VectorWritable vw = new VectorWritable();
        while (reader.next(key, vw)) {
            writer.append(new Text(SLASH.split(key.toString())[1]),
                    new VectorWritable(classifier.classifyFull(vw.get())));
        }
        writer.close();
        reader.close();
    } else {
        boolean succeeded = runMapReduce(parsedArgs);
        if (!succeeded) {
            return -1;
        }
    }

    // load the labels
    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

    // loop over the results and create the confusion matrix
    SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>(
            getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
    analyzeResults(labelMap, dirIterable, analyzer);

    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
    return 0;
}
From source file:com.lyft.hive.serde.DynamoDbSerDe.java
License:Apache License
@Override
public Object deserialize(Writable blob) throws SerDeException {
    ArrayList<Object> row = buildRow();
    Text rowText = (Text) blob;
    Map<String, String> values = decomposeRow(rowText.toString());

    for (int c = 0; c < numColumns; c++) {
        try {
            String t = values.get(columnNames.get(c));
            TypeInfo typeInfo = columnTypes.get(c);

            // Convert the column to the correct type when needed and set in row obj
            PrimitiveTypeInfo pti = (PrimitiveTypeInfo) typeInfo;
            switch (pti.getPrimitiveCategory()) {
            case STRING:
                row.set(c, t);
                break;
            case BYTE:
                Byte b;
                b = Byte.valueOf(t);
                row.set(c, b);
                break;
            case SHORT:
                Short s;
                s = Short.valueOf(t);
                row.set(c, s);
                break;
            case INT:
                Integer i;
                i = Integer.valueOf(t);
                row.set(c, i);
                break;
            case LONG:
                Long l;
                l = Long.valueOf(t);
                row.set(c, l);
                break;
            case FLOAT:
                Float f;
                f = Float.valueOf(t);
                row.set(c, f);
                break;
            case DOUBLE:
                Double d;
                d = Double.valueOf(t);
                row.set(c, d);
                break;
            case BOOLEAN:
                Boolean bool;
                bool = Boolean.valueOf(t);
                row.set(c, bool);
                break;
            case TIMESTAMP:
                row.set(c, parseTimestamp(t));
                break;
            case DATE:
                Date date;
                date = Date.valueOf(t);
                row.set(c, date);
                break;
            case DECIMAL:
                HiveDecimal bd = HiveDecimal.create(t);
                row.set(c, bd);
                break;
            case CHAR:
                HiveChar hc = new HiveChar(t, ((CharTypeInfo) typeInfo).getLength());
                row.set(c, hc);
                break;
            case VARCHAR:
                HiveVarchar hv = new HiveVarchar(t, ((VarcharTypeInfo) typeInfo).getLength());
                row.set(c, hv);
                break;
            default:
                throw new SerDeException("Unsupported type " + typeInfo);
            }
        } catch (RuntimeException e) {
            row.set(c, null);
        }
    }
    return row;
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
private void cloneOutput() throws IOException {
    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list,
     * which is used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();
        Reader reader = new Reader(fs, path, fs.getConf());
        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());
                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }
                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }
        swap(crushInput, prevCrushOut.toString());
    }

    /*
     * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the data
     * that was in the input dir, the difference being there are fewer files in the output dir.
     */
    if (removableFiles.size() > 0) {
        String srcDirName = fs.makeQualified(srcDir).toUri().getPath();
        String destName = fs.makeQualified(dest).toUri().getPath();

        print(Verbosity.INFO, "\n\nMoving removed files to " + destName);

        for (String name : removableFiles) {
            Path srcPath = new Path(name);
            Path destPath = new Path(destName + name).getParent();

            print(Verbosity.INFO, "\n Moving " + srcPath + " to " + destPath);
            rename(srcPath, destPath, null);
        }
    }
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
/**
 * Moves the skipped files to the output directory. Called when operating in normal (non-clone) mode.
 */
private void moveOutput() throws IOException {
    List<FileStatus> listStatus = getOutputMappings();
    Text srcFile = new Text();
    Text crushOut = new Text();
    Set<String> crushOutputFiles = new HashSet<String>(nBuckets);

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();
        Reader reader = new Reader(fs, path, fs.getConf());
        try {
            while (reader.next(srcFile, crushOut)) {
                crushOutputFiles.add(new Path(crushOut.toString()).toUri().getPath());
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }
    }

    assert crushOutputFiles.size() == nBuckets;

    /*
     * The crush output files will appear in a subdirectory of the output directory. The subdirectory will be the full
     * path of the input directory that was crushed. E.g.
     *
     * Crush input:
     * /user/me/input/dir1/file1
     * /user/me/input/dir1/file2
     * /user/me/input/dir2/file3
     * /user/me/input/dir2/file4
     * /user/me/input/dir3/dir4/file5
     * /user/me/input/dir3/dir4/file6
     *
     * Crush output:
     * /user/me/output/user/me/input/dir1/crushed_file ...
     * /user/me/output/user/me/input/dir2/crushed_file ...
     * /user/me/output/user/me/input/dir2/dir3/dir4/crushed_file ...
     *
     * We need to collapse this down to:
     * /user/me/output/dir1/crushed_file ...
     * /user/me/output/dir2/crushed_file ...
     * /user/me/output/dir2/dir3/dir4/crushed_file ...
     */
    String srcDirName = fs.makeQualified(srcDir).toUri().getPath();
    String destName = fs.makeQualified(dest).toUri().getPath();
    String partToReplace = fs.makeQualified(outDir).toUri().getPath() + "/crush" + srcDirName;

    for (String crushOutputFile : crushOutputFiles) {
        Path srcPath = new Path(crushOutputFile);
        Path destPath = new Path(destName + crushOutputFile.substring(partToReplace.length())).getParent();

        print(Verbosity.INFO, "\n Renaming " + srcPath + " to " + destPath);
        rename(srcPath, destPath, null);
    }

    /*
     * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the data
     * that was in the input dir, the difference being there are fewer files in the output dir.
     */
    if (skippedFiles.size() > 0) {
        print(Verbosity.INFO, "\n\nMoving skipped files to " + destName);

        for (String name : skippedFiles) {
            Path srcPath = new Path(name);
            Path destPath = new Path(destName + name.substring(srcDirName.length())).getParent();

            print(Verbosity.INFO, "\n Renaming " + srcPath + " to " + destPath);
            rename(srcPath, destPath, null);
        }
    }
}
From source file:com.m6d.filecrush.crush.CrushReducer.java
License:Apache License
@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector, Reporter reporter)
        throws IOException {
    String bucket = bucketId.toString();
    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));
    int idx = findMatcher(dirName);
    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);
    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it to the task attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    FileSinkOperator.RecordWriter parquetSink = null;
    Exception rootCause = null;

    Void voidKey = null;
    Object key = null;
    Object value = null;

    String schemaSignature = null;
    String columns = null;
    String columnsTypes = null;
    Properties jobProperties = new Properties();
    boolean firstFile = true;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());
            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            if (firstFile) {
                firstFile = false;

                key = reader.createKey();
                if (null == key)
                    key = NullWritable.get();
                value = reader.createValue();

                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    schemaSignature = getAvroFileSchemaString(job, inputPath);
                    job.set("avro.schema.literal", schemaSignature);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    MessageType schema = getParquetFileSchema(job, inputPath);
                    List<Type> fieldsFromSchema = schema.getFields();
                    for (Type field : fieldsFromSchema) {
                        if (field.getOriginalType() != null) {
                            if (StringUtils.equals(field.getOriginalType().toString(), "DECIMAL")) {
                                String primitiveType = field.asPrimitiveType().toString();
                                int loc = primitiveType.indexOf("DECIMAL");
                                int start = loc + 7;
                                int end = primitiveType.indexOf(")", loc) + 1;
                                String ps = primitiveType.substring(start, end);
                                if (!decimalTypesHashMap.containsKey(ps)) {
                                    decimalTypesHashMap.put(field.getName().toString(), ps);
                                }
                            }
                        }
                    }
                    schemaSignature = getParquetFileSchemaString(job, inputPath);

                    StringBuilder columnsSb = new StringBuilder();
                    StringBuilder columnsTypesSb = new StringBuilder();
                    boolean firstColumn = true;
                    for (ColumnDescriptor col : schema.getColumns()) {
                        if (firstColumn) {
                            firstColumn = false;
                        } else {
                            columnsSb.append(",");
                            columnsTypesSb.append(",");
                        }
                        columnsSb.append(col.getPath()[0]);
                        String typeName = col.getType().toString();
                        if ("INT96".equals(typeName))
                            typeName = "timestamp";
                        else if ("INT64".equals(typeName))
                            typeName = "bigint";
                        else if ("INT32".equals(typeName))
                            typeName = "int";
                        else if ("INT16".equals(typeName))
                            typeName = "smallint";
                        else if ("INT8".equals(typeName))
                            typeName = "tinyint";
                        else if ("BINARY".equals(typeName))
                            typeName = "string";
                        else if ("BOOLEAN".equals(typeName))
                            typeName = "boolean";
                        else if ("DOUBLE".equals(typeName))
                            typeName = "double";
                        else if ("FLOAT".equals(typeName))
                            typeName = "float";
                        else if (typeName.startsWith("FIXED_LEN_BYTE_ARRAY")) {
                            String column = col.toString();
                            int start = column.indexOf('[') + 1;
                            int end = column.indexOf(']');
                            String fieldName = column.substring(start, end);
                            String lookupVal = decimalTypesHashMap.get(fieldName);
                            LOG.info("final string: decimal" + lookupVal);
                            typeName = "decimal" + lookupVal;
                        }
                        columnsTypesSb.append(typeName);
                    }
                    columns = columnsSb.toString();
                    columnsTypes = columnsTypesSb.toString();
                    jobProperties.put(IOConstants.COLUMNS, columns);
                    jobProperties.put(IOConstants.COLUMNS_TYPES, columnsTypes);
                    parquetSerDe = new ParquetHiveSerDe();
                    parquetSerDe.initialize(job, jobProperties);
                } else {
                    schemaSignature = key.getClass().getName() + ":" + value.getClass().getName();
                }

                /*
                 * Set the key and value class in the conf, which the output format uses to get type information.
                 */
                job.setOutputKeyClass(key.getClass());
                job.setOutputValueClass(value.getClass());

                /*
                 * Output file name is absolute so we can just add it to the crush prefix.
                 */
                if (MapredParquetOutputFormat.class.isAssignableFrom(getOutputFormatClass(idx))) {
                    outputFormat = "parquet";
                    parquetSink = createParquetRecordWriter(idx, valueOut.toString(), jobProperties,
                            (Class<? extends org.apache.hadoop.io.Writable>) value.getClass(), reporter);
                } else {
                    outputFormat = getOutputFormatClass(idx).getName();
                    sink = createRecordWriter(idx, valueOut.toString());
                }
            } else { // next files
                /*
                 * Ensure schema signature is the same as the first file's.
                 */
                String nextSchemaSignature = null;
                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getAvroFileSchemaString(job, inputPath);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getParquetFileSchemaString(job, inputPath);
                } else {
                    Object otherKey = reader.createKey();
                    if (otherKey == null)
                        otherKey = NullWritable.get();
                    nextSchemaSignature = otherKey.getClass().getName() + ":"
                            + reader.createValue().getClass().getName();
                }
                if (!schemaSignature.equals(nextSchemaSignature)) {
                    throw new IllegalArgumentException(
                            format("Heterogeneous schema detected in file %s: [%s] != [%s]", inputPath,
                                    nextSchemaSignature, schemaSignature));
                }
            }

            boolean ret;
            if ("parquet".equals(outputFormat))
                ret = reader.next(voidKey, value);
            else
                ret = reader.next(key, value);
            while (ret) {
                if ("text".equals(inputFormat))
                    sink.write(key, null);
                else if (sink != null)
                    sink.write(key, value);
                else {
                    ParquetHiveRecord parquetHiveRecord = new ParquetHiveRecord(value,
                            (StructObjectInspector) parquetSerDe.getObjectInspector());
                    parquetSink.write(parquetHiveRecord);
                }

                reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);

                if ("parquet".equals(outputFormat))
                    ret = reader.next(voidKey, value);
                else
                    ret = reader.next(key, value);
            }

            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;
            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;
                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }
        if (null != parquetSink) {
            try {
                parquetSink.close(false);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }
            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }
            throw new RuntimeException(rootCause);
        }
    }
}