List of usage examples for org.apache.hadoop.io.Text#set

public void set(Text other)

Note: Text also provides set(String string), set(byte[] utf8), and set(byte[] utf8, int start, int len); most of the examples below use the String overload.
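The recurring pattern in all of the examples below is to allocate one reusable Text instance and call set() immediately before each write, rather than constructing a new Text per record. The following minimal sketch illustrates that pattern; the class name UpperCaseMapper and the upper-casing transform are illustrative only and do not come from the source files listed here.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative mapper: emits each input line and its upper-cased form,
// reusing a single Text instance for the key and one for the value.
public class UpperCaseMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Reused across map() calls; set() replaces the contents without reallocating.
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        outKey.set(value);                            // set(Text other): copy the bytes of another Text
        outValue.set(value.toString().toUpperCase()); // set(String): replace contents from a String
        context.write(outKey, outValue);
    }
}

Reusing a single Text instance this way avoids allocating a new object per record; the framework serializes the key and value at the point of context.write, so overwriting them on the next iteration is safe.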
From source file:uk.bl.wa.hadoop.ArchiveFileRecordReader.java
License:Open Source License
@Override
public boolean next(Text key, WritableArchiveRecord value) throws IOException {
    boolean found = false;
    while (!found) {
        boolean hasNext = false;
        try {
            hasNext = iterator.hasNext();
        } catch (Throwable e) {
            log.error("ERROR in hasNext(): " + this.archiveName + ": " + e.toString());
            hasNext = false;
        }
        try {
            if (hasNext) {
                record = (ArchiveRecord) iterator.next();
                found = true;
                key.set(this.archiveName);
                value.setRecord(record);
            } else if (!this.nextFile()) {
                break;
            }
        } catch (Throwable e) {
            found = false;
            log.error("ERROR reading " + this.archiveName, e);
            // Reached the end of the file? If so move on or exit:
            if (e.getCause() instanceof EOFException) {
                log.error("EOF while reading " + this.archiveName);
                if (!this.nextFile()) {
                    break;
                }
            }
        }
    }
    return found;
}
From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXMapperTest.java
License:Open Source License
@Test
public void testMapper() throws IOException, JSONException {
    Set<String> skippableRecords = new HashSet<String>();
    skippableRecords.add("application/warc-fields");
    skippableRecords.add("text/dns");

    File inputFile = new File(
            "../warc-indexer/src/test/resources/gov.uk-revisit-warcs/BL-20140325121225068-00000-32090~opera~8443.warc.gz");
    String archiveName = inputFile.getName();

    ArchiveReader reader = ArchiveReaderFactory.get(inputFile);
    Iterator<ArchiveRecord> ir = reader.iterator();
    ArchiveRecord record;
    Text key = new Text();
    WritableArchiveRecord value = new WritableArchiveRecord();
    while (ir.hasNext()) {
        record = (ArchiveRecord) ir.next();
        key.set(archiveName);
        value.setRecord(record);
        LOG.info("GOT: " + record.getHeader().getRecordIdentifier());
        LOG.info("GOT: " + record.getHeader().getMimetype());
        // Skip records that can't be analysed:
        if (skippableRecords.contains(record.getHeader().getMimetype()))
            continue;
        // Run through them all:
        LOG.info("Running without testing output...");
        mapDriver.setInput(key, value);
        List<Pair<Text, Text>> result = mapDriver.run();
        if (result != null && result.size() > 0) {
            MDX mdx = new MDX(result.get(0).getSecond().toString());
            LOG.info("RESULT MDX: " + mdx);
            // Perform a specific check for one of the items:
            if ("http://data.gov.uk/".equals(Normalisation.sanitiseWARCHeaderValue(record.getHeader().getUrl()))
                    && record.getHeader().getMimetype().contains("response")) {
                Text testKey = new Text("sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG");
                MDX testMdx = new MDX(
                        "{\"digest\":\"sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG\",\"url\":\"http://data.gov.uk/\",\"timestamp\":\"20140325121238\"}");
                assertEquals(testKey, result.get(0).getFirst());
                assertEquals(testMdx.getUrl(), mdx.getUrl());
                assertEquals(testMdx.getHash(), mdx.getHash());
                assertEquals(testMdx.getTs(), mdx.getTs());
            }
        }
        mapDriver.resetOutput();
    }
}
From source file:uk.bl.wa.hadoop.mapred.FrequencyCountingReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterator<Text> iterator, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    log.warn("Reducing for key: " + key);
    // Use a simple map to collect only distinct results for this key:
    Map<String, MutableInt> matches = new HashMap<String, MutableInt>();
    while (iterator.hasNext()) {
        String m = iterator.next().toString();
        // Get or set up the counter:
        MutableInt value = matches.get(m);
        if (value == null) {
            value = new MutableInt();
            matches.put(m, value);
        }
        // Increment the counter for this match:
        value.inc();
    }
    // Loop through and collect all distinct matches:
    Text result = new Text();
    Text outKey = key;
    OutputCollector<Text, Text> collector;
    int pos = key.find("__");
    if (pos == -1) {
        collector = output;
    } else {
        String[] fp = key.toString().split("__");
        collector = mos.getCollector(fp[0], fp[1], reporter);
        outKey = new Text(fp[1]);
    }
    log.info("For key: " + key + " outKey " + outKey + " outputting " + matches.size() + " unique values.");
    for (String match : matches.keySet()) {
        // This outputs the count:
        result.set(match + "\t" + matches.get(match).get());
        // And collect:
        collector.collect(outKey, result);
    }
}
From source file:uk.bl.wa.hadoop.mapreduce.cdx.TinyCDXServerMapper.java
License:Open Source License
/**
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    File inputFile = new File(args[0]);
    ArchiveReader arcreader = ArchiveReaderFactory.get(inputFile);
    arcreader.setStrict(false);
    WarcIndexer warcIndexer = new WarcIndexer();
    ArcIndexer arcIndexer = new ArcIndexer();
    Iterator<CaptureSearchResult> archiveIterator;
    if (inputFile.getName().matches("^.+\\.warc(\\.gz)?$")) {
        archiveIterator = warcIndexer.iterator((WARCReader) arcreader);
    } else {
        archiveIterator = arcIndexer.iterator((ARCReader) arcreader);
    }
    Iterator<String> cdxlines = SearchResultToCDXFormatAdapter.adapt(archiveIterator,
            new CDXFormat(DereferencingArchiveToCDXRecordReader.CDX_11));
    // Test it:
    TinyCDXServerMapper mapper = new TinyCDXServerMapper();
    mapper.tcs = new TinyCDXSender("http://localhost:9090/t3", 20);
    Text cdxline = new Text();
    while (cdxlines.hasNext()) {
        cdxline.set(cdxlines.next());
        mapper.map(cdxline, cdxline, null);
    }
    mapper.tcs.close();
}
From source file:weka.distributed.hadoop.CorrelationMatrixHadoopMapper.java
License:Open Source License
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    // output all the rows in this partial matrix
    double[][] partialMatrix = m_task.getMatrix();
    int[][] coOcc = m_task.getCoOccurrenceCounts();
    for (int i = 0; i < partialMatrix.length; i++) {
        double[] row = partialMatrix[i];
        int[] co = null;
        if (coOcc != null) {
            co = coOcc[i];
        }
        MatrixRowHolder rh = new MatrixRowHolder(i, row, co);
        byte[] bytes = rowHolderToBytes(rh);
        String sKey = ("" + i);
        Text key = new Text();
        key.set(sKey);
        BytesWritable value = new BytesWritable();
        value.set(bytes, 0, bytes.length);
        context.write(key, value);
    }
}
From source file:weka.distributed.hadoop.CorrelationMatrixRowHadoopReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    List<MatrixRowHolder> rowsToAgg = new ArrayList<MatrixRowHolder>();
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            rowsToAgg.add(deserialize(bytes));
        }
    } catch (ClassNotFoundException ex) {
        throw new IOException(ex);
    }
    if (rowsToAgg.size() > 0) {
        int rowNum = rowsToAgg.get(0).getRowNumber();
        List<double[]> rows = new ArrayList<double[]>();
        List<int[]> coOcc = null;
        if (!m_missingsWereReplacedWithMeans) {
            coOcc = new ArrayList<int[]>();
        }
        for (MatrixRowHolder r : rowsToAgg) {
            if (r.getRowNumber() != rowNum) {
                throw new IOException("Matrix row numbers for this key appear to differ!");
            }
            rows.add(r.getRow());
            if (!m_missingsWereReplacedWithMeans) {
                coOcc.add(r.getCoOccurrencesCounts());
            }
        }
        try {
            double[] aggregated = m_task.aggregate(rowsToAgg.get(0).getRowNumber(), rows, coOcc,
                    m_headerWithSummaryAtts, m_missingsWereReplacedWithMeans, m_covariance, m_deleteClassIfSet);
            // assemble Text key (row num) and Text row (space separated values)
            Text outKey = new Text();
            outKey.set("" + rowNum);
            StringBuilder b = new StringBuilder();
            for (int i = 0; i < aggregated.length; i++) {
                if (i < aggregated.length - 1) {
                    b.append("" + aggregated[i]).append(" ");
                } else {
                    b.append("" + aggregated[i]);
                }
            }
            Text outVal = new Text();
            outVal.set(b.toString());
            context.write(outKey, outVal);
        } catch (DistributedWekaException e) {
            throw new IOException(e);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}
From source file:weka.distributed.hadoop.CSVToArffHeaderHadoopMapper.java
License:Open Source License
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    if (m_fatalMappingError != null) {
        throw m_fatalMappingError;
    }
    HeaderAndQuantileDataHolder holder = null;
    Instances header = null;
    if (!m_estimateQuantiles) {
        header = m_task.getHeader();
    } else {
        try {
            holder = m_task.getHeaderAndQuantileEstimators();
        } catch (DistributedWekaException ex) {
            throw new IOException(ex);
        }
    }
    ByteArrayOutputStream ostream = new ByteArrayOutputStream();
    OutputStream os = ostream;
    ObjectOutputStream p;
    p = new ObjectOutputStream(new BufferedOutputStream(new GZIPOutputStream(os)));
    p.writeObject(header != null ? header : holder);
    p.flush();
    p.close();
    byte[] bytes = ostream.toByteArray();

    // make sure all headers go to the same reducer
    String contantKey = "header";
    Text key = new Text();
    key.set(contantKey);
    BytesWritable value = new BytesWritable();
    value.set(bytes, 0, bytes.length);
    context.write(key, value); // write the header
}
From source file:weka.distributed.hadoop.CSVToArffHeaderHadoopReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputDestination = conf.get(CSV_TO_ARFF_HEADER_WRITE_PATH);
    if (outputDestination == null || outputDestination.length() == 0) {
        throw new IOException("No destination given for aggregated ARFF header");
    }
    List<Instances> headersToAgg = new ArrayList<Instances>();
    List<HeaderAndQuantileDataHolder> holdersToAgg = new ArrayList<HeaderAndQuantileDataHolder>();
    int counter = 0;
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            if (m_estimateQuantiles) {
                HeaderAndQuantileDataHolder holder = deserializeHolder(bytes);
                holdersToAgg.add(holder);
            } else {
                Instances aHeader = deserializeHeader(bytes);
                headersToAgg.add(aHeader);
            }
            counter++;
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }
    try {
        Instances aggregated = m_estimateQuantiles ? m_task.aggregateHeadersAndQuartiles(holdersToAgg)
                : m_task.aggregate(headersToAgg);
        writeHeaderToDestination(aggregated, outputDestination, conf);
        Text outkey = new Text();
        outkey.set("AKey");
        Text outval = new Text();
        outval.set("Num headers aggregated " + counter);
        context.write(outkey, outval);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:weka.distributed.hadoop.KMeansCentroidSketchHadoopMapper.java
License:Open Source License
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    // emit serialized sketch tasks with run number as key
    for (int i = 0; i < m_tasks.length; i++) {
        System.err.println("Number of instances in sketch: " + m_tasks[i].getCurrentSketch().numInstances());
        System.err.println(
                "Number of instances in reservoir: " + m_tasks[i].getReservoirSample().getSample().size());
        byte[] bytes = sketchToBytes(m_tasks[i]);
        String runNum = "run" + i;
        Text key = new Text();
        key.set(runNum);
        BytesWritable value = new BytesWritable();
        value.set(bytes, 0, bytes.length);
        context.write(key, value);
    }
}
From source file:weka.distributed.hadoop.KMeansCentroidSketchHadoopReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    int runNum = 0;
    String rS = key.toString();
    rS = rS.replace("run", "");
    try {
        runNum = Integer.parseInt(rS);
    } catch (NumberFormatException ex) {
        throw new IOException(ex);
    }
    CentroidSketch initial = null;
    List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>();
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            CentroidSketch current = deserialize(bytes);
            if (initial == null) {
                initial = current;
            } else {
                initial.aggregateReservoir(current.getReservoirSample());
            }
            if (m_isFirstIteration) {
                distsForRun.add(current.getDistanceFunction());
            }
        }
        // add the reservoir to the current sketch
        initial.addReservoirToCurrentSketch();
        // update the distance function with global numeric attribute ranges
        if (m_isFirstIteration) {
            Instances distancePrimingData = KMeansReduceTask
                    .computeDistancePrimingDataFromDistanceFunctions(distsForRun, m_transformedHeaderNoSummary);
            initial.getDistanceFunction().setInstances(distancePrimingData);
        }
        // save the sketch out
        writeSketchToDestination(initial, m_outputDestination, runNum, context.getConfiguration());
        System.err.println("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());
        Text outKey = new Text();
        outKey.set("Summary:\n");
        Text outVal = new Text();
        outVal.set("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());
        context.write(outKey, outVal);
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}