Example usage for org.apache.hadoop.io Text set

List of usage examples for org.apache.hadoop.io Text set

Introduction

On this page you can find example usage for org.apache.hadoop.io Text set.

Prototype

public void set(Text other) 

Document

Copies the contents of another Text into this one.
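
Before the full examples, here is a minimal, self-contained sketch (not taken from any of the sources below) showing the three Text.set overloads that appear throughout this page:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextSetSketch {
    public static void main(String[] args) {
        Text a = new Text();
        a.set("hello");                          // set(String): replace contents with a string

        byte[] utf8 = "world".getBytes(StandardCharsets.UTF_8);
        Text b = new Text();
        b.set(utf8, 0, utf8.length);             // set(byte[], int, int): copy a UTF-8 byte range

        Text c = new Text();
        c.set(a);                                // set(Text other): copy another Text's bytes

        System.out.println(a + " " + b + " " + c);  // prints: hello world hello
    }
}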

Usage

From source file:io.covert.binary.analysis.BuildSequenceFile.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    File inDir = new File(args[0]);
    Path name = new Path(args[1]);

    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, name, Text.class, BytesWritable.class,
            CompressionType.RECORD);

    for (File file : inDir.listFiles()) {
        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }

        FileInputStream fileIn = new FileInputStream(file);
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream((int) file.length());
        int b;
        while (-1 != (b = fileIn.read())) {
            bytesOut.write(b);
        }
        fileIn.close();
        bytesOut.close();
        byte[] bytes = bytesOut.toByteArray();

        val.set(bytes, 0, bytes.length);
        key.set(file.getName());

        writer.append(key, val);
    }
    writer.close();

    return 0;
}

From source file:io.covert.binary.analysis.BuildSequenceFileFromTarball.java

License:Apache License

public void load(FileSystem fs, Configuration conf, File inputTarball, Path outputDir) throws Exception {
    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Path sequenceName = new Path(outputDir, inputTarball.getName() + ".seq");
    System.out.println("Writing to " + sequenceName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class,
            BytesWritable.class, CompressionType.RECORD);

    InputStream is = new FileInputStream(inputTarball);
    if (inputTarball.toString().toLowerCase().endsWith(".gz")) {
        is = new GZIPInputStream(is);
    } else if (inputTarball.toString().toLowerCase().endsWith(".bz")
            || inputTarball.toString().endsWith(".bz2")) {
        is.read(); // read 'B'
        is.read(); // read 'Z'
        is = new CBZip2InputStream(is);
    }

    final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory()
            .createArchiveInputStream("tar", is);
    TarArchiveEntry entry = null;
    while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) {
        if (!entry.isDirectory()) {

            try {
                final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream();
                IOUtils.copy(debInputStream, outputFileStream);
                outputFileStream.close();
                byte[] outputFile = outputFileStream.toByteArray();
                val.set(outputFile, 0, outputFile.length);

                MessageDigest md = MessageDigest.getInstance("MD5");
                md.update(outputFile);
                byte[] digest = md.digest();
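                // Hex-encode the digest: (b & 0xff) + 0x100 is always of the form 0x1XX,
                // so substring(1) keeps exactly two hex digits per byte.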
                String hexdigest = "";
                for (int i = 0; i < digest.length; i++) {
                    hexdigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1);
                }
                key.set(hexdigest);
                writer.append(key, val);
            } catch (IOException e) {
                System.err.println("Warning: tarball may be truncated: " + inputTarball);
                // Truncated Tarball
                break;
            }
        }
    }
    debInputStream.close();
    writer.close();
}

From source file:io.covert.binary.analysis.BuildTarBzSequenceFile.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    File inDir = new File(args[0]);
    Path name = new Path(args[1]);

    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(name)) {
        fs.mkdirs(name);
    }
    for (File file : inDir.listFiles()) {
        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }
        // Create the writer only after confirming this entry is a regular file,
        // so skipped entries do not leak an open writer or leave empty .seq files.
        Path sequenceName = new Path(name, file.getName() + ".seq");
        System.out.println("Writing to " + sequenceName);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class,
                BytesWritable.class, CompressionType.RECORD);

        final InputStream is = new FileInputStream(file);
        final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory()
                .createArchiveInputStream("tar", is);
        TarArchiveEntry entry = null;
        while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) {
            if (!entry.isDirectory()) {

                final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream();
                IOUtils.copy(debInputStream, outputFileStream);
                outputFileStream.close();
                byte[] outputFile = outputFileStream.toByteArray();
                val.set(outputFile, 0, outputFile.length);

                MessageDigest md = MessageDigest.getInstance("MD5");
                md.update(outputFile);
                byte[] digest = md.digest();
                String hexdigest = "";
                for (int i = 0; i < digest.length; i++) {
                    hexdigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1);
                }
                key.set(hexdigest);
                writer.append(key, val);
            }
        }
        debInputStream.close();
        writer.close();
    }

    return 0;
}

From source file:io.github.thammegowda.Local2SeqFile.java

License:Apache License

private void writeOutput(RemoteIterator<? extends FileStatus> input) throws IOException {
    Path outPath = new Path(output);
    if (distribFs.exists(outPath)) {
        throw new IllegalArgumentException("Output file already exists, Not overwriting it:" + output);
    }

    Writer writer = SequenceFile.createWriter(distribFs.getConf(), Writer.file(outPath),
            Writer.keyClass(Text.class), Writer.valueClass(BytesWritable.class),
            Writer.compression(SequenceFile.CompressionType.RECORD));
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    long skipped = 0;
    long copied = 0;
    while (input.hasNext()) {
        FileStatus next = input.next();
        if (filter(next)) {
            key.set(next.getPath().toString());
            FSDataInputStream stream = localFs.open(next.getPath());
            //CAUTION : this could cause memory overflow
            byte[] bytes = IOUtils.toByteArray(stream);
            value.set(bytes, 0, bytes.length);
            writer.append(key, value);
            copied++;
        } else {
            skipped++;
        }
    }
    writer.close();
    System.out.println("Files copied ::" + copied);
    System.out.println("Files skipped ::" + skipped);
}

From source file:io.prestosql.plugin.accumulo.io.AccumuloRecordCursor.java

License:Apache License

public AccumuloRecordCursor(AccumuloRowSerializer serializer, BatchScanner scanner, String rowIdName,
        List<AccumuloColumnHandle> columnHandles, List<AccumuloColumnConstraint> constraints) {
    this.columnHandles = requireNonNull(columnHandles, "columnHandles is null");
    this.scanner = requireNonNull(scanner, "scanner is null");
    this.serializer = requireNonNull(serializer, "serializer is null");
    this.serializer.setRowIdName(requireNonNull(rowIdName, "rowIdName is null"));

    requireNonNull(columnHandles, "columnHandles is null");
    requireNonNull(constraints, "constraints is null");

    if (retrieveOnlyRowIds(rowIdName)) {
        this.scanner.addScanIterator(new IteratorSetting(1, "firstentryiter", FirstEntryInRowIterator.class));

        fieldToColumnName = new String[1];
        fieldToColumnName[0] = rowIdName;

        // Set a flag on the serializer saying we are only going to be retrieving the row ID
        this.serializer.setRowOnly(true);
    } else {
        // Else, we will be scanning some more columns here
        this.serializer.setRowOnly(false);

        // Fetch the reserved row ID column
        this.scanner.fetchColumn(ROW_ID_COLUMN, ROW_ID_COLUMN);

        Text family = new Text();
        Text qualifier = new Text();

        // Create an array which maps the column ordinal to the name of the column
        fieldToColumnName = new String[columnHandles.size()];
        for (int i = 0; i < columnHandles.size(); ++i) {
            AccumuloColumnHandle columnHandle = columnHandles.get(i);
            fieldToColumnName[i] = columnHandle.getName();

            // Make sure to skip the row ID!
            if (!columnHandle.getName().equals(rowIdName)) {
                // Set the mapping of presto column name to the family/qualifier
                this.serializer.setMapping(columnHandle.getName(), columnHandle.getFamily().get(),
                        columnHandle.getQualifier().get());

                // Set our scanner to fetch this family/qualifier column
                // This will help us prune which data we receive from Accumulo
                family.set(columnHandle.getFamily().get());
                qualifier.set(columnHandle.getQualifier().get());
                this.scanner.fetchColumn(family, qualifier);
            }
        }
    }

    IteratorSetting setting = new IteratorSetting(WHOLE_ROW_ITERATOR_PRIORITY, WholeRowIterator.class);
    scanner.addScanIterator(setting);

    iterator = this.scanner.iterator();
}

From source file:io.prestosql.plugin.accumulo.serializers.StringRowSerializer.java

License:Apache License

@Override
public void setDate(Text text, Date value) {
    text.set(Long.toString(MILLISECONDS.toDays(value.getTime())).getBytes(UTF_8));
}

From source file:io.sanfran.wikiTrends.extraction.hadoop.FileNameLineRecordReader.java

License:Open Source License

/** Read a line. */
public synchronized boolean next(Text key, Text value) throws IOException {

    try {
        // We always read one extra line, which lies outside the upper
        // split limit i.e. (end - 1)
        while (getFilePosition() <= end) {
            key.set(fileName);

            int newSize = in.readLine(value, maxLineLength, Math.max(maxBytesToConsume(pos), maxLineLength));
            if (newSize == 0) {
                return false;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                return true;
            }

            // line too long. try again
            LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
        }
    } catch (EOFException corruptFileException) {
        LOG.warn("corrupt file: " + fileName);
    }

    return false;
}

From source file:it.uniroma1.bdc.piccioli.tesi.TriangleCounting.java

License:Apache License

@Override
public void compute(Vertex<Text, Text, NullWritable> vertex, Iterable<Text> messages) throws IOException {

    if (getSuperstep() == 0) {

        // build a Text containing the list of neighbor IDs
        Text neigborhood = new Text();

        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();

        for (Edge<Text, NullWritable> edge : edges) {
            neigborhood.set(neigborhood.toString() + "-" + edge.getTargetVertexId().toString());
        }

        for (Edge<Text, NullWritable> edge : edges) {
            this.sendMessage(edge.getTargetVertexId(), neigborhood);
        }

    } else if (getSuperstep() == 1) {

        Double T = 0.0;

        // compare the "missing" edges received in messages with this vertex's neighbor list
        for (Text message : messages) {
            String[] msgSplit = message.toString().split("-"); // neighbor list from the message

            Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();

            for (Edge<Text, NullWritable> edge : edges) {
                for (String missEdge : msgSplit) {
                    if (missEdge.equals(edge.getTargetVertexId().toString())) {
                        T++;
                    }
                }
            }

        }

        T = T / 6;

        // send the aggregate value
        aggregate(SOMMA, new DoubleWritable(T));

        vertex.setValue(new Text(T.toString()));
        vertex.voteToHalt();

    }

}

From source file:it.uniroma1.bdc.tesi.piccioli.giraphstandalone.trash.TriangleCount_OLD.java

License:Apache License

@Override
public void compute(Vertex<Text, Text, NullWritable> vertex, Iterable<Text> messages) throws IOException {

    if (getSuperstep() == 0) {

        // build a Text containing the list of neighbor IDs
        Text neigborhood = new Text();

        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();

        for (Edge<Text, NullWritable> edge : edges) {
            neigborhood.set(neigborhood.toString() + "-" + edge.getTargetVertexId().toString());
        }

        for (Edge<Text, NullWritable> edge : edges) {
            this.sendMessage(edge.getTargetVertexId(), neigborhood);
        }

    } else if (getSuperstep() == 1) {

        Double T = 0.0;
        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();
        // compare the "missing" edges received in messages with this vertex's neighbor list
        for (Text message : messages) {
            String[] msgSplit = message.toString().split("-"); // neighbor list from the message

            for (Edge<Text, NullWritable> edge : edges) {
                for (String missEdge : msgSplit) {
                    if (missEdge.equals(edge.getTargetVertexId().toString())) {
                        T++;
                    }
                }
            }

        }

        T = T / 6;
        vertex.setValue(new Text(T.toString()));
        vertex.voteToHalt();

        aggregate(SOMMA, new DoubleWritable(T));
        getAggregatedValue(SOMMA);
        System.out.println("DEBUG " + getAggregatedValue(SOMMA));

    }

}

From source file:jadoop.HadoopGridJob.java

License:Open Source License

/**
 * Create a tasksN.seq sequence file in the input directory for each task.
 * Each file contains the key and command that define one map task. The key
 * is the key that was associated with the task when it was added to the
 * job. The value is a TextArrayWritable object with the following
 * contents:
 * <UL>
 * <LI>true/false - indicating if standard output is to be captured.
 * <LI>true/false - indicating if standard error is to be captured.
 * <LI>timeout - the timeout for the task.
 * <LI>cmd - the command to be run in the mapper task.
 * <LI>... - any successive elements contain an argument to the cmd.
 * </UL>
 * 
 * @see HadoopGridTaskRunner
 * 
 * @param hdfsInputDir
 *            the input directory on the HDFS where the tasks.seq file is to
 *            be created.
 * 
 * @throws IOException
 *             if there is a problem creating the tasks.seq file.
 */
private void writeTasksSequenceFiles(Path hdfsInputDir) throws IOException {
    /*
     * Seems as if we should be able to just write one task file with
     * multiple key/value pairs in it. However, hadoop did not seem to want
     * to send each entry to a different node. Rather one node processed
     * many of the tasks. It seems as if this could be fixed by defining how
     * hadoop is to split up the sequence file, but we were unable to get
     * that to work. Writing a different task file for each task is a bit of
     * a hack solution, but it works. Each task is then run on a different
     * node, as desired.
     */

    Text mapperKey = new Text();
    TextArrayWritable mapperVal = new TextArrayWritable();

    // for each task in the job...
    int index = 0;
    for (HadoopGridTask hgt : taskMap.values()) {

        Path seqFileInDirPath = new Path(hdfsInputDir.toString() + "/tasks" + index + ".seq");

        SequenceFile.Writer writer = SequenceFile.createWriter(job.getConfiguration(),
                Writer.file(seqFileInDirPath), Writer.keyClass(Text.class),
                Writer.valueClass(TextArrayWritable.class));

        String taskKey = hgt.getKey();
        String[] taskVal = hgt.getCommand();

        // set the key for sequence file entry for this task
        mapperKey.set(taskKey);

        /*
         * Build an array of Writeable holding the flags that indicate if
         * standard output/error are to be captured, the timeout and the
         * command and its arguments.
         */
        Writable[] vals = new Writable[taskVal.length + 3];

        // put the flags in the array.
        vals[0] = new Text(String.valueOf(hgt.captureStandardOutput()));
        vals[1] = new Text(String.valueOf(hgt.captureStandardError()));
        vals[2] = new Text(String.valueOf(hgt.getTimeout()));

        // put the command and its arguments into the array.
        for (int i = 3; i < taskVal.length + 3; i++) {
            vals[i] = new Text(taskVal[i - 3]);
        }

        /*
         * Set the value for the sequence file entry for this task to be the
         * array.
         */
        mapperVal.set(vals);

        writer.append(mapperKey, mapperVal);

        writer.close();

        index++;
    }
}
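
For context, here is a hypothetical mapper sketch (not part of the jadoop sources; see HadoopGridTaskRunner for the real consumer) that unpacks one of the entries written above. It assumes jadoop's TextArrayWritable extends Hadoop's ArrayWritable, so that get() returns the Writable[] stored by writeTasksSequenceFiles(); the import for TextArrayWritable is omitted since its package is not shown here.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;

public class TaskUnpackSketch extends Mapper<Text, TextArrayWritable, Text, Text> {
    @Override
    protected void map(Text key, TextArrayWritable value, Context context)
            throws IOException, InterruptedException {
        Writable[] vals = value.get();

        // Elements 0-2 are the capture flags and timeout written above.
        boolean captureStdout = Boolean.parseBoolean(vals[0].toString());
        boolean captureStderr = Boolean.parseBoolean(vals[1].toString());
        long timeout = Long.parseLong(vals[2].toString());

        // Elements 3 and up are the command and its arguments.
        String[] command = new String[vals.length - 3];
        for (int i = 3; i < vals.length; i++) {
            command[i - 3] = vals[i].toString();
        }

        // ... run the command here, honoring the capture flags and timeout ...
    }
}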