List of usage examples for org.apache.hadoop.io Text set
public void set(Text other)
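Before the project-specific examples, here is a minimal, self-contained sketch of the common Text.set(...) overloads (set(Text), set(String), and set(byte[], int, int)). The class name TextSetDemo and the variable names are illustrative only and are not taken from any of the source files listed below.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextSetDemo {
    public static void main(String[] args) {
        Text reused = new Text();

        // set(String): replace the contents with the UTF-8 encoding of a String
        reused.set("first value");

        // set(Text other): copy the bytes of another Text into this instance
        Text other = new Text("second value");
        reused.set(other);

        // set(byte[] utf8, int start, int len): copy a slice of a UTF-8 byte array
        byte[] utf8 = "third value".getBytes(StandardCharsets.UTF_8);
        reused.set(utf8, 0, utf8.length);

        System.out.println(reused); // prints "third value"
    }
}

As the examples below show, set(...) overwrites the instance's backing buffer in place, which is why MapReduce and SerDe code typically allocates one Text and keeps calling set(...) on it rather than creating a new object per record.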
From source file:com.ibm.spss.hive.serde2.xml.objectinspector.ObjectInspectorTest.java
License:Open Source License
@SuppressWarnings("rawtypes") public void testSimpleXmlNotMap() throws SerDeException { XmlSerDe xmlSerDe = new XmlSerDe(); Configuration configuration = new Configuration(); Properties properties = new Properties(); properties.put(LIST_COLUMNS, "test"); properties.put(LIST_COLUMN_TYPES, "map<string,string>"); properties.setProperty("column.xpath.test", "//*[contains(name(),'test')]/text()"); xmlSerDe.initialize(configuration, properties); Text text = new Text(); text.set("<root><test1>string1</test1><test2>string2</test2></root>"); Object o = xmlSerDe.deserialize(text); XmlStructObjectInspector structInspector = ((XmlStructObjectInspector) xmlSerDe.getObjectInspector()); StructField structField = structInspector.getStructFieldRef("test"); Object data = structInspector.getStructFieldData(o, structField); XmlMapObjectInspector fieldInspector = (XmlMapObjectInspector) structField.getFieldObjectInspector(); Map map = fieldInspector.getMap(data); assertEquals(0, map.size());/*from w w w. ja va2 s.com*/ }
From source file:com.ikanow.aleph2.analytics.hadoop.assets.ObjectNodeWritableComparable.java
License:Apache License
@Override
public void write(DataOutput out) throws IOException {
    final Text text = new Text();
    text.set(_object_node.toString());
    text.write(out);
}
From source file:com.inmobi.conduit.distcp.tools.CopyListing.java
License:Apache License
/**
 * Validate the final resulting path listing to see if there are any duplicate entries
 *
 * @param pathToListFile - path listing built by doBuildListing
 * @throws IOException - Any issues while checking for duplicates and throws
 * @throws DuplicateFileException - if there are duplicates
 */
protected void checkForDuplicates(Path pathToListFile) throws DuplicateFileException, IOException {
    Configuration config = getConf();
    FileSystem fs = pathToListFile.getFileSystem(config);
    Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, sortedList, config);
    try {
        Text lastKey = new Text("*"); // source relative path can never hold *
        FileStatus lastFileStatus = new FileStatus();

        Text currentKey = new Text();
        while (reader.next(currentKey)) {
            if (currentKey.equals(lastKey)) {
                FileStatus currentFileStatus = new FileStatus();
                reader.getCurrentValue(currentFileStatus);
                throw new DuplicateFileException("File " + lastFileStatus.getPath() + " and "
                        + currentFileStatus.getPath() + " would cause duplicates. Aborting");
            }
            reader.getCurrentValue(lastFileStatus);
            lastKey.set(currentKey);
        }
    } finally {
        IOUtils.closeStream(reader);
    }
}
From source file:com.jeffy.fbds.SequenceFileWriter.java
License:Apache License
public static void main(String[] args) throws IOException {
    String uri = args[0];
    Configuration conf = new Configuration();
    Path path = new Path(uri);
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (SequenceFile.Writer writer = SequenceFile.createWriter(conf, Writer.file(path),
            Writer.keyClass(key.getClass()), Writer.valueClass(value.getClass()))) {
        for (int i = 0; i < 100; i++) {
            key.set(100 - i);
            value.set(DATA[i % DATA.length]);
            System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
            writer.append(key, value);
        }
    }
}
From source file:com.jfolson.hive.serde.RBaseSerDe.java
License:Apache License
protected void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException {
    //LOG.info("Serializing hive type: "+oi.getTypeName());
    //LOG.info("Serializing category: "+oi.getCategory().toString());
    if (o == null) {
        tbOut.writeNull();
        return;
    }
    switch (oi.getCategory()) {
    case PRIMITIVE: {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
        //LOG.info("Serializing primitive: "+poi.getPrimitiveCategory().toString());
        switch (poi.getPrimitiveCategory()) {
        case VOID: {
            return;
        }
        case BINARY: {
            BinaryObjectInspector boi = (BinaryObjectInspector) poi;
            TypedBytesWritable bytes = reuse == null ? new TypedBytesWritable() : (TypedBytesWritable) reuse;
            BytesWritable bytesWrite = boi.getPrimitiveWritableObject(o);
            if (bytesWrite != null) {
                bytes.set(bytesWrite);
                if (!RType.isValid(bytes)) {
                    LOG.error("Invalid typedbytes detected with type: " + RType.getType(bytes).code);
                    bytes.setValue(new Buffer(bytesWrite.getBytes(), 0, bytesWrite.getLength()));
                }
                //LOG.info("Writing binary primitive with class: "+bytes.getClass().getName());
                tbOut.write(bytes);
            }
            return;
        }
        case BOOLEAN: {
            BooleanObjectInspector boi = (BooleanObjectInspector) poi;
            BooleanWritable r = reuse == null ? new BooleanWritable() : (BooleanWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case BYTE: {
            ByteObjectInspector boi = (ByteObjectInspector) poi;
            ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case SHORT: {
            ShortObjectInspector spoi = (ShortObjectInspector) poi;
            ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse;
            r.set(spoi.get(o));
            tbOut.write(r);
            return;
        }
        case INT: {
            IntObjectInspector ioi = (IntObjectInspector) poi;
            IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
            r.set(ioi.get(o));
            tbOut.write(r);
            return;
        }
        case LONG: {
            LongObjectInspector loi = (LongObjectInspector) poi;
            LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse;
            r.set(loi.get(o));
            tbOut.write(r);
            return;
        }
        case FLOAT: {
            FloatObjectInspector foi = (FloatObjectInspector) poi;
            FloatWritable r = reuse == null ? new FloatWritable() : (FloatWritable) reuse;
            r.set(foi.get(o));
            tbOut.write(r);
            return;
        }
        case DOUBLE:
            DoubleObjectInspector doi = (DoubleObjectInspector) poi;
            DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse;
            r.set(doi.get(o));
            tbOut.write(r);
            return;
        case STRING: {
            StringObjectInspector soi = (StringObjectInspector) poi;
            Text t = soi.getPrimitiveWritableObject(o);
            tbOut.write(t);
            return;
        }
        default: {
            throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory());
        }
        }
    }
    case LIST: {
        ListObjectInspector loi = (ListObjectInspector) oi;
        ObjectInspector elemOI = loi.getListElementObjectInspector();
        List l = loi.getList(o);
        // Don't use array (typecode: 144) until everything supports NA values in typedbytes
        if (false) { //(elemOI.getCategory()==ObjectInspector.Category.PRIMITIVE){
            tbOut.writeArray(l, (PrimitiveObjectInspector) elemOI);
        } else {
            tbOut.writeVector(l, (PrimitiveObjectInspector) elemOI);
        }
        return;
    }
    case MAP:
    case STRUCT: {
        // For complex object, serialize to JSON format
        String s = SerDeUtils.getJSONString(o, oi);
        Text t = reuse == null ? new Text() : (Text) reuse;
        // convert to Text and write it
        t.set(s);
        tbOut.write(t);
        return;
    }
    default: {
        throw new RuntimeException("Unrecognized type: " + oi.getCategory());
    }
    }
}
From source file:com.jfolson.hive.serde.RTypedBytesSerDe.java
License:Apache License
private void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException {
    //LOG.info("Serializing hive type: "+oi.getTypeName());
    //LOG.info("Serializing category: "+oi.getCategory().toString());
    if (o == null) {
        tbOut.writeNull();
        return;
    }
    switch (oi.getCategory()) {
    case PRIMITIVE: {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
        //LOG.info("Serializing primitive: "+poi.getPrimitiveCategory().toString());
        switch (poi.getPrimitiveCategory()) {
        case VOID: {
            return;
        }
        case BINARY: {
            BinaryObjectInspector boi = (BinaryObjectInspector) poi;
            TypedBytesWritable bytes = reuse == null ? new TypedBytesWritable() : (TypedBytesWritable) reuse;
            BytesWritable bytesWrite = boi.getPrimitiveWritableObject(o);
            if (bytesWrite != null) {
                bytes.set(bytesWrite);
                if (!RType.isValid(bytes)) {
                    LOG.error("Invalid typedbytes detected with type: " + RType.getType(bytes).code);
                    bytes.setValue(new Buffer(bytesWrite.getBytes(), 0, bytesWrite.getLength()));
                }
                //LOG.info("Writing binary primitive with class: "+bytes.getClass().getName());
                tbOut.write(bytes);
            }
            return;
        }
        case BOOLEAN: {
            BooleanObjectInspector boi = (BooleanObjectInspector) poi;
            BooleanWritable r = reuse == null ? new BooleanWritable() : (BooleanWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case BYTE: {
            ByteObjectInspector boi = (ByteObjectInspector) poi;
            ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse;
            r.set(boi.get(o));
            tbOut.write(r);
            return;
        }
        case SHORT: {
            ShortObjectInspector spoi = (ShortObjectInspector) poi;
            ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse;
            r.set(spoi.get(o));
            tbOut.write(r);
            return;
        }
        case INT: {
            IntObjectInspector ioi = (IntObjectInspector) poi;
            IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
            r.set(ioi.get(o));
            tbOut.write(r);
            return;
        }
        case LONG: {
            LongObjectInspector loi = (LongObjectInspector) poi;
            LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse;
            r.set(loi.get(o));
            tbOut.write(r);
            return;
        }
        case FLOAT: {
            FloatObjectInspector foi = (FloatObjectInspector) poi;
            FloatWritable r = reuse == null ? new FloatWritable() : (FloatWritable) reuse;
            r.set(foi.get(o));
            tbOut.write(r);
            return;
        }
        case DOUBLE:
            DoubleObjectInspector doi = (DoubleObjectInspector) poi;
            DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse;
            r.set(doi.get(o));
            tbOut.write(r);
            return;
        case STRING: {
            StringObjectInspector soi = (StringObjectInspector) poi;
            Text t = soi.getPrimitiveWritableObject(o);
            tbOut.write(t);
            return;
        }
        default: {
            throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory());
        }
        }
    }
    case LIST: {
        ListObjectInspector loi = (ListObjectInspector) oi;
        ObjectInspector elemOI = loi.getListElementObjectInspector();
        List l = loi.getList(o);
        if (false) { //(elemOI.getCategory()==ObjectInspector.Category.PRIMITIVE){
            tbOut.writeArray(l, (PrimitiveObjectInspector) elemOI);
        } else {
            tbOut.writeVector(l, (PrimitiveObjectInspector) elemOI);
        }
        return;
    }
    case MAP:
    case STRUCT: {
        // For complex object, serialize to JSON format
        String s = SerDeUtils.getJSONString(o, oi);
        Text t = reuse == null ? new Text() : (Text) reuse;
        // convert to Text and write it
        t.set(s);
        tbOut.write(t);
        return;
    }
    default: {
        throw new RuntimeException("Unrecognized type: " + oi.getCategory());
    }
    }
}
From source file:com.jfolson.hive.serde.RTypedBytesWritableInput.java
License:Apache License
public Text readText(Text t) throws IOException {
    if (t == null) {
        t = new Text();
    }
    t.set(in.readString());
    return t;
}
From source file:com.lucidworks.hadoop.utils.ZipFileRecordReader.java
License:Apache License
/**
 * Each ZipEntry is decompressed and readied for the Mapper. If the
 * ZipFileInputFormat has been set to Lenient (not the default), certain
 * exceptions will be gracefully ignored to prevent a larger job from
 * failing.
 */
@Override
public boolean next(Text key, BytesWritable value) throws IOException {
    {
        ZipEntry entry = null;
        try {
            entry = zip.getNextEntry();
        } catch (Throwable e) {
            if (!ZipFileInputFormat.getLenient()) {
                throw new RuntimeException(e);
            }
        }
        // Sanity check
        if (entry == null) {
            processed = true;
            return false;
        }
        // Filename
        key.set(new Text(entry.getName()));
        byte[] bufferOut = null;
        int cummulativeBytesRead = 0;
        while (true) {
            int bytesRead = 0;
            byte[] bufferIn = new byte[8192];
            try {
                bytesRead = zip.read(bufferIn, 0, bufferIn.length);
            } catch (Throwable e) {
                if (!ZipFileInputFormat.getLenient()) {
                    throw new RuntimeException(e);
                }
                return false;
            }
            if (bytesRead > 0) {
                byte[] tmp = head(bufferIn, bytesRead);
                if (cummulativeBytesRead == 0) {
                    bufferOut = tmp;
                } else {
                    bufferOut = add(bufferOut, tmp);
                }
                cummulativeBytesRead += bytesRead;
            } else {
                break;
            }
        }
        try {
            zip.closeEntry();
        } catch (IOException e) {
            if (!ZipFileInputFormat.getLenient()) {
                throw new RuntimeException(e);
            }
        }
        // Uncompressed contents
        if (bufferOut != null) {
            value.setCapacity(bufferOut.length);
            value.set(bufferOut, 0, bufferOut.length);
        } else {
            // should we return false here? I don't think so, since I think that would mean we can't process any more records
            log.warn("bufferOut is null for " + key);
        }
        return true;
    }
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
private void cloneOutput() throws IOException {
    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list,
     * which is used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();
        Reader reader = new Reader(fs, path, fs.getConf());
        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());
                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }
                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }
        swap(crushInput, prevCrushOut.toString());
    }

    /*
     * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the data
     * that was in the input dir, the difference being there are fewer files in the output dir.
     */
    if (removableFiles.size() > 0) {
        String srcDirName = fs.makeQualified(srcDir).toUri().getPath();
        String destName = fs.makeQualified(dest).toUri().getPath();

        print(Verbosity.INFO, "\n\nMoving removed files to " + destName);

        for (String name : removableFiles) {
            Path srcPath = new Path(name);
            Path destPath = new Path(destName + name).getParent();

            print(Verbosity.INFO, "\n Moving " + srcPath + " to " + destPath);
            rename(srcPath, destPath, null);
        }
    }
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {
    print(Verbosity.INFO, "\nUsing temporary directory " + tmpDir.toUri().getPath() + "\n");

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();
    removableFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Bucketer partitionBucketer = new Bucketer(maxTasks, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();
    int fileCount = 0;

    //Path bucketFile = new Path(tmpIn, "dirs_" + fileCount++);
    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                String dirPath = dir.toUri().getPath();
                print(Verbosity.INFO, "\n\n[" + dirPath + "]");

                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFilesMatcher == null)
                            return true;
                        ignoredFilesMatcher.reset(testPath.toUri().getPath());
                        boolean ignores = ignoredFilesMatcher.matches();
                        if (ignores)
                            LOG.info("Ignoring file " + testPath);
                        return !ignores;
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, "\n Directory is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            String filePath = path.toUri().getPath();
                            boolean skipFile = false;
                            if (skippedFilesMatcher != null) {
                                skippedFilesMatcher.reset(filePath);
                                if (skippedFilesMatcher.matches()) {
                                    skipFile = true;
                                }
                            }

                            boolean changed = uncrushedFiles.add(filePath);
                            assert changed : path.toUri().getPath();
                            long fileLength = content.getLen();

                            if (!skipFile && fileLength <= maxEligibleSize) {
                                if (removeEmptyFiles && fileLength == 0)
                                    removableFiles.add(filePath);
                                else {
                                    crushables.add(content);
                                    crushableBytes += fileLength;
                                }
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and
                     * then increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, "\n Directory has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);

                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                            print(Verbosity.INFO, "\n Directory skipped");
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, "\n Generating " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> filesInBucket = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), filesInBucket.size()));

                                key.set(bucketId);

                                for (String f : filesInBucket) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    /*
                                     * Write one row per file to maximize the number of mappers
                                     */
                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, filesInBucket.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!removableFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n Marked " + removableFiles.size() + " files for removal");

                        for (String removable : removableFiles) {
                            uncrushedFiles.remove(removable);
                            print(Verbosity.VERBOSE, "\n " + removable);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_REMOVED, removableFiles.size());
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        writer.close();
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= maxTasks;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();
    int totalReducers = 0;

    for (Bucket partition : partitions) {
        String partitionName = partition.name();

        int p = Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1));
        partNum.set(p);

        if (partition.contents().size() > 0)
            totalReducers++;

        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }

    writer.close();

    print(Verbosity.INFO, "\n\nNumber of allocated reducers = " + totalReducers);
    job.setInt("mapreduce.job.reduces", totalReducers);

    DataOutputStream countersStream = fs.create(this.counters);
    jobCounters.write(countersStream);
    countersStream.close();
}