Example usage for org.apache.hadoop.io Text set

List of usage examples for org.apache.hadoop.io Text set

Introduction

On this page you can find example usage for org.apache.hadoop.io Text set.

Prototype

public void set(Text other) 

Document

Copies the contents of another Text into this one.
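
Before the full examples, here is a minimal, self-contained sketch (not taken from any of the sources below) showing the three Text.set overloads that appear throughout this page:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextSetSketch {
    public static void main(String[] args) {
        Text a = new Text();
        a.set("hello");                          // set(String): replace contents with a string

        byte[] utf8 = "world".getBytes(StandardCharsets.UTF_8);
        Text b = new Text();
        b.set(utf8, 0, utf8.length);             // set(byte[], int, int): copy a UTF-8 byte range

        Text c = new Text();
        c.set(a);                                // set(Text other): copy another Text's bytes

        System.out.println(a + " " + b + " " + c);  // prints: hello world hello
    }
}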

Usage

From source file:io.covert.binary.analysis.BuildSequenceFile.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    File inDir = new File(args[0]);
    Path name = new Path(args[1]);

    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, name, Text.class, BytesWritable.class,
            CompressionType.RECORD);

    for (File file : inDir.listFiles()) {
        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }

        FileInputStream fileIn = new FileInputStream(file);
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream((int) file.length());
        int b;
        while (-1 != (b = fileIn.read())) {
            bytesOut.write(b);
        }
        fileIn.close();
        bytesOut.close();
        byte[] bytes = bytesOut.toByteArray();

        val.set(bytes, 0, bytes.length);
        key.set(file.getName());

        writer.append(key, val);
    }
    writer.close();

    return 0;
}

From source file:io.covert.binary.analysis.BuildSequenceFileFromTarball.java

License:Apache License

public void load(FileSystem fs, Configuration conf, File inputTarball, Path outputDir) throws Exception {
    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Path sequenceName = new Path(outputDir, inputTarball.getName() + ".seq");
    System.out.println("Writing to " + sequenceName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class,
            BytesWritable.class, CompressionType.RECORD);

    InputStream is = new FileInputStream(inputTarball);
    if (inputTarball.toString().toLowerCase().endsWith(".gz")) {
        is = new GZIPInputStream(is);
    } else if (inputTarball.toString().toLowerCase().endsWith(".bz")
            || inputTarball.toString().endsWith(".bz2")) {
        is.read(); // read 'B'
        is.read(); // read 'Z'
        is = new CBZip2InputStream(is);
    }

    final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory()
            .createArchiveInputStream("tar", is);
    TarArchiveEntry entry = null;
    while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) {
        if (!entry.isDirectory()) {

            try {
                final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream();
                IOUtils.copy(debInputStream, outputFileStream);
                outputFileStream.close();
                byte[] outputFile = outputFileStream.toByteArray();
                val.set(outputFile, 0, outputFile.length);

                MessageDigest md = MessageDigest.getInstance("MD5");
                md.update(outputFile);
                byte[] digest = md.digest();
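                // Hex-encode the digest: (b & 0xff) + 0x100 is always of the form 0x1XX,
                // so substring(1) keeps exactly two hex digits per byte.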
                String hexdigest = "";
                for (int i = 0; i < digest.length; i++) {
                    hexdigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1);
                }
                key.set(hexdigest);
                writer.append(key, val);
            } catch (IOException e) {
                System.err.println("Warning: tarball may be truncated: " + inputTarball);
                // Truncated Tarball
                break;
            }
        }
    }
    debInputStream.close();
    writer.close();
}

From source file:io.covert.binary.analysis.BuildTarBzSequenceFile.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    File inDir = new File(args[0]);
    Path name = new Path(args[1]);

    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(name)) {
        fs.mkdirs(name);
    }
    for (File file : inDir.listFiles()) {
        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }
        // Create the writer only after confirming this entry is a regular file,
        // so skipped entries do not leak an open writer or leave empty .seq files.
        Path sequenceName = new Path(name, file.getName() + ".seq");
        System.out.println("Writing to " + sequenceName);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class,
                BytesWritable.class, CompressionType.RECORD);

        final InputStream is = new FileInputStream(file);
        final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory()
                .createArchiveInputStream("tar", is);
        TarArchiveEntry entry = null;
        while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) {
            if (!entry.isDirectory()) {

                final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream();
                IOUtils.copy(debInputStream, outputFileStream);
                outputFileStream.close();
                byte[] outputFile = outputFileStream.toByteArray();
                val.set(outputFile, 0, outputFile.length);

                MessageDigest md = MessageDigest.getInstance("MD5");
                md.update(outputFile);
                byte[] digest = md.digest();
                String hexdigest = "";
                for (int i = 0; i < digest.length; i++) {
                    hexdigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1);
                }
                key.set(hexdigest);
                writer.append(key, val);
            }
        }
        debInputStream.close();
        writer.close();
    }

    return 0;
}

From source file:io.github.thammegowda.Local2SeqFile.java

License:Apache License

private void writeOutput(RemoteIterator<? extends FileStatus> input) throws IOException {
    Path outPath = new Path(output);
    if (distribFs.exists(outPath)) {
        throw new IllegalArgumentException("Output file already exists, Not overwriting it:" + output);
    }

    Writer writer = SequenceFile.createWriter(distribFs.getConf(), Writer.file(outPath),
            Writer.keyClass(Text.class), Writer.valueClass(BytesWritable.class),
            Writer.compression(SequenceFile.CompressionType.RECORD));
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    long skipped = 0;
    long copied = 0;
    while (input.hasNext()) {
        FileStatus next = input.next();
        if (filter(next)) {
            key.set(next.getPath().toString());
            FSDataInputStream stream = localFs.open(next.getPath());
            //CAUTION : this could cause memory overflow
            byte[] bytes = IOUtils.toByteArray(stream);
            value.set(bytes, 0, bytes.length);
            writer.append(key, value);
            copied++;
        } else {
            skipped++;
        }
    }
    writer.close();
    System.out.println("Files copied ::" + copied);
    System.out.println("Files skipped ::" + skipped);
}

From source file:io.prestosql.plugin.accumulo.io.AccumuloRecordCursor.java

License:Apache License

public AccumuloRecordCursor(AccumuloRowSerializer serializer, BatchScanner scanner, String rowIdName,
        List<AccumuloColumnHandle> columnHandles, List<AccumuloColumnConstraint> constraints) {
    this.columnHandles = requireNonNull(columnHandles, "columnHandles is null");
    this.scanner = requireNonNull(scanner, "scanner is null");
    this.serializer = requireNonNull(serializer, "serializer is null");
    this.serializer.setRowIdName(requireNonNull(rowIdName, "rowIdName is null"));

    requireNonNull(columnHandles, "columnHandles is null");
    requireNonNull(constraints, "constraints is null");

    if (retrieveOnlyRowIds(rowIdName)) {
        this.scanner.addScanIterator(new IteratorSetting(1, "firstentryiter", FirstEntryInRowIterator.class));

        fieldToColumnName = new String[1];
        fieldToColumnName[0] = rowIdName;

        // Set a flag on the serializer saying we are only going to be retrieving the row ID
        this.serializer.setRowOnly(true);
    } else {
        // Else, we will be scanning some more columns here
        this.serializer.setRowOnly(false);

        // Fetch the reserved row ID column
        this.scanner.fetchColumn(ROW_ID_COLUMN, ROW_ID_COLUMN);

        Text family = new Text();
        Text qualifier = new Text();

        // Create an array which maps the column ordinal to the name of the column
        fieldToColumnName = new String[columnHandles.size()];
        for (int i = 0; i < columnHandles.size(); ++i) {
            AccumuloColumnHandle columnHandle = columnHandles.get(i);
            fieldToColumnName[i] = columnHandle.getName();

            // Make sure to skip the row ID!
            if (!columnHandle.getName().equals(rowIdName)) {
                // Set the mapping of presto column name to the family/qualifier
                this.serializer.setMapping(columnHandle.getName(), columnHandle.getFamily().get(),
                        columnHandle.getQualifier().get());

                // Set our scanner to fetch this family/qualifier column
                // This will help us prune which data we receive from Accumulo
                family.set(columnHandle.getFamily().get());
                qualifier.set(columnHandle.getQualifier().get());
                this.scanner.fetchColumn(family, qualifier);
            }
        }
    }

    IteratorSetting setting = new IteratorSetting(WHOLE_ROW_ITERATOR_PRIORITY, WholeRowIterator.class);
    scanner.addScanIterator(setting);

    iterator = this.scanner.iterator();
}

From source file:io.prestosql.plugin.accumulo.serializers.StringRowSerializer.java

License:Apache License

@Override
public void setDate(Text text, Date value) {
    text.set(Long.toString(MILLISECONDS.toDays(value.getTime())).getBytes(UTF_8));
}

From source file:io.sanfran.wikiTrends.extraction.hadoop.FileNameLineRecordReader.java

License:Open Source License

/** Read a line. */
public synchronized boolean next(Text key, Text value) throws IOException {

    try {
        // We always read one extra line, which lies outside the upper
        // split limit i.e. (end - 1)
        while (getFilePosition() <= end) {
            key.set(fileName);

            int newSize = in.readLine(value, maxLineLength, Math.max(maxBytesToConsume(pos), maxLineLength));
            if (newSize == 0) {
                return false;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                return true;
            }

            // line too long. try again
            LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
        }
    } catch (EOFException corruptFileException) {
        LOG.warn("corrupt file: " + fileName);
    }

    return false;
}

From source file:it.uniroma1.bdc.piccioli.tesi.TriangleCounting.java

License:Apache License

@Override
public void compute(Vertex<Text, Text, NullWritable> vertex, Iterable<Text> messages) throws IOException {

    if (getSuperstep() == 0) {

        // build a Text containing the list of neighbor IDs
        Text neigborhood = new Text();

        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();

        for (Edge<Text, NullWritable> edge : edges) {
            neigborhood.set(neigborhood.toString() + "-" + edge.getTargetVertexId().toString());
        }

        for (Edge<Text, NullWritable> edge : edges) {
            this.sendMessage(edge.getTargetVertexId(), neigborhood);
        }

    } else if (getSuperstep() == 1) {

        Double T = 0.0;

        // compare the "missing" edges received in messages with this vertex's neighbor list
        for (Text message : messages) {
            String[] msgSplit = message.toString().split("-"); // neighbor list from the message

            Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();

            for (Edge<Text, NullWritable> edge : edges) {
                for (String missEdge : msgSplit) {
                    if (missEdge.equals(edge.getTargetVertexId().toString())) {
                        T++;
                    }
                }
            }

        }

        T = T / 6;

        // send the aggregate value
        aggregate(SOMMA, new DoubleWritable(T));

        vertex.setValue(new Text(T.toString()));
        vertex.voteToHalt();

    }

}

From source file:it.uniroma1.bdc.tesi.piccioli.giraphstandalone.trash.TriangleCount_OLD.java

License:Apache License

@Override
public void compute(Vertex<Text, Text, NullWritable> vertex, Iterable<Text> messages) throws IOException {

    if (getSuperstep() == 0) {

        // build a Text containing the list of neighbor IDs
        Text neigborhood = new Text();

        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();

        for (Edge<Text, NullWritable> edge : edges) {
            neigborhood.set(neigborhood.toString() + "-" + edge.getTargetVertexId().toString());
        }

        for (Edge<Text, NullWritable> edge : edges) {
            this.sendMessage(edge.getTargetVertexId(), neigborhood);
        }

    } else if (getSuperstep() == 1) {

        Double T = 0.0;
        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();
        // compare the "missing" edges received in messages with this vertex's neighbor list
        for (Text message : messages) {
            String[] msgSplit = message.toString().split("-"); // neighbor list from the message

            for (Edge<Text, NullWritable> edge : edges) {
                for (String missEdge : msgSplit) {
                    if (missEdge.equals(edge.getTargetVertexId().toString())) {
                        T++;
                    }
                }
            }

        }

        T = T / 6;
        vertex.setValue(new Text(T.toString()));
        vertex.voteToHalt();

        aggregate(SOMMA, new DoubleWritable(T));
        getAggregatedValue(SOMMA);
        System.out.println("DEBUG " + getAggregatedValue(SOMMA));

    }

}

From source file:jadoop.HadoopGridJob.java

License:Open Source License

/**
 * Create a tasksN.seq sequence file in the input directory for each task.
 * Each file contains the key and command that define one map task. The key
 * is the key that was associated with the task when it was added to the
 * job. The value is a TextArrayWritable object with the following
 * contents:
 * <UL>
 * <LI>true/false - indicating if standard output is to be captured.
 * <LI>true/false - indicating if standard error is to be captured.
 * <LI>timeout - the timeout for the task.
 * <LI>cmd - the command to be run in the mapper task.
 * <LI>... - any successive elements contain an argument to the cmd.
 * </UL>
 * 
 * @see HadoopGridTaskRunner
 * 
 * @param hdfsInputDir
 *            the input directory on the HDFS where the tasks.seq file is to
 *            be created.
 * 
 * @throws IOException
 *             if there is a problem creating the tasks.seq file.
 */
private void writeTasksSequenceFiles(Path hdfsInputDir) throws IOException {
    /*
     * Seems as if we should be able to just write one task file with
     * multiple key/value pairs in it. However, hadoop did not seem to want
     * to send each entry to a different node. Rather one node processed
     * many of the tasks. It seems as if this could be fixed by defining how
     * hadoop is to split up the sequence file, but we were unable to get
     * that to work. Writing a different task file for each task is a bit of
     * a hack solution, but it works. Each task is then run on a different
     * node, as desired.
     */

    Text mapperKey = new Text();
    TextArrayWritable mapperVal = new TextArrayWritable();

    // for each task in the job...
    int index = 0;
    for (HadoopGridTask hgt : taskMap.values()) {

        Path seqFileInDirPath = new Path(hdfsInputDir.toString() + "/tasks" + index + ".seq");

        SequenceFile.Writer writer = SequenceFile.createWriter(job.getConfiguration(),
                Writer.file(seqFileInDirPath), Writer.keyClass(Text.class),
                Writer.valueClass(TextArrayWritable.class));

        String taskKey = hgt.getKey();
        String[] taskVal = hgt.getCommand();

        // set the key for sequence file entry for this task
        mapperKey.set(taskKey);

        /*
         * Build an array of Writeable holding the flags that indicate if
         * standard output/error are to be captured, the timeout and the
         * command and its arguments.
         */
        Writable[] vals = new Writable[taskVal.length + 3];

        // put the flags in the array.
        vals[0] = new Text(String.valueOf(hgt.captureStandardOutput()));
        vals[1] = new Text(String.valueOf(hgt.captureStandardError()));
        vals[2] = new Text(String.valueOf(hgt.getTimeout()));

        // put the command and its arguments into the array.
        for (int i = 3; i < taskVal.length + 3; i++) {
            vals[i] = new Text(taskVal[i - 3]);
        }

        /*
         * Set the value for the sequence file entry for this task to be the
         * array.
         */
        mapperVal.set(vals);

        writer.append(mapperKey, mapperVal);

        writer.close();

        index++;
    }
}
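
For context, here is a hypothetical mapper sketch (not part of the jadoop sources; see HadoopGridTaskRunner for the real consumer) that unpacks one of the entries written above. It assumes jadoop's TextArrayWritable extends Hadoop's ArrayWritable, so that get() returns the Writable[] stored by writeTasksSequenceFiles(); the import for TextArrayWritable is omitted since its package is not shown here.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;

public class TaskUnpackSketch extends Mapper<Text, TextArrayWritable, Text, Text> {
    @Override
    protected void map(Text key, TextArrayWritable value, Context context)
            throws IOException, InterruptedException {
        Writable[] vals = value.get();

        // Elements 0-2 are the capture flags and timeout written above.
        boolean captureStdout = Boolean.parseBoolean(vals[0].toString());
        boolean captureStderr = Boolean.parseBoolean(vals[1].toString());
        long timeout = Long.parseLong(vals[2].toString());

        // Elements 3 and up are the command and its arguments.
        String[] command = new String[vals.length - 3];
        for (int i = 3; i < vals.length; i++) {
            command[i - 3] = vals[i].toString();
        }

        // ... run the command here, honoring the capture flags and timeout ...
    }
}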