Example usage for org.apache.hadoop.io.Text.set

List of usage examples for org.apache.hadoop.io.Text.set

Introduction

On this page you can find example usage for org.apache.hadoop.io.Text.set.

Prototype

public void set(Text other) 

Document

Copy a text: replaces the contents of this Text with the contents of the given Text.
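The usage examples below mostly call the related overloads set(String) and, on BytesWritable values, set(byte[], int, int). As a minimal sketch of the method in isolation (the class name and values here are illustrative, not taken from the examples), assuming only a standard Hadoop client dependency on the classpath:

import org.apache.hadoop.io.Text;

public class TextSetExample {
    public static void main(String[] args) {
        // set(String): replace the contents with the UTF-8 encoding of a String
        Text key = new Text();
        key.set("example.warc.gz"); // illustrative value

        // set(Text other): copy the contents of another Text instance
        Text copy = new Text();
        copy.set(key);

        System.out.println(copy); // prints: example.warc.gz
    }
}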

Usage

From source file:uk.bl.wa.hadoop.ArchiveFileRecordReader.java

License:Open Source License

@Override
public boolean next(Text key, WritableArchiveRecord value) throws IOException {
    boolean found = false;
    while (!found) {
        boolean hasNext = false;
        try {
            hasNext = iterator.hasNext();
        } catch (Throwable e) {
            log.error("ERROR in hasNext():  " + this.archiveName + ": " + e.toString());
            hasNext = false;
        }
        try {
            if (hasNext) {
                record = (ArchiveRecord) iterator.next();
                found = true;
                key.set(this.archiveName);
                value.setRecord(record);
            } else if (!this.nextFile()) {
                break;
            }
        } catch (Throwable e) {
            found = false;
            log.error("ERROR reading " + this.archiveName, e);
            // Reached the end of the file? If so move on or exit:
            if (e.getCause() instanceof EOFException) {
                log.error("EOF while reading " + this.archiveName);
                if (!this.nextFile()) {
                    break;
                }
            }
        }
    }
    return found;
}

From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXMapperTest.java

License:Open Source License

@Test
public void testMapper() throws IOException, JSONException {

    Set<String> skippableRecords = new HashSet<String>();
    skippableRecords.add("application/warc-fields");
    skippableRecords.add("text/dns");

    File inputFile = new File(
            "../warc-indexer/src/test/resources/gov.uk-revisit-warcs/BL-20140325121225068-00000-32090~opera~8443.warc.gz");
    String archiveName = inputFile.getName();

    ArchiveReader reader = ArchiveReaderFactory.get(inputFile);
    Iterator<ArchiveRecord> ir = reader.iterator();
    ArchiveRecord record;
    Text key = new Text();
    WritableArchiveRecord value = new WritableArchiveRecord();
    while (ir.hasNext()) {
        record = (ArchiveRecord) ir.next();
        key.set(archiveName);
        value.setRecord(record);

        LOG.info("GOT: " + record.getHeader().getRecordIdentifier());
        LOG.info("GOT: " + record.getHeader().getMimetype());
        // Skip records that can't be analysed:
        if (skippableRecords.contains(record.getHeader().getMimetype()))
            continue;

        // Run through them all:
        LOG.info("Running without testing output...");
        mapDriver.setInput(key, value);
        List<Pair<Text, Text>> result = mapDriver.run();
        if (result != null && result.size() > 0) {
            MDX mdx = new MDX(result.get(0).getSecond().toString());
            LOG.info("RESULT MDX: " + mdx);

            // Perform a specific check for one of the items:
            if ("http://data.gov.uk/".equals(Normalisation.sanitiseWARCHeaderValue(record.getHeader().getUrl()))
                    && record.getHeader().getMimetype().contains("response")) {
                Text testKey = new Text("sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG");
                MDX testMdx = new MDX(
                        "{\"digest\":\"sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG\",\"url\":\"http://data.gov.uk/\",\"timestamp\":\"20140325121238\"}");
                assertEquals(testKey, result.get(0).getFirst());
                assertEquals(testMdx.getUrl(), mdx.getUrl());
                assertEquals(testMdx.getHash(), mdx.getHash());
                assertEquals(testMdx.getTs(), mdx.getTs());
            }

        }
        mapDriver.resetOutput();
    }
}

From source file:uk.bl.wa.hadoop.mapred.FrequencyCountingReducer.java

License:Open Source License

@Override
public void reduce(Text key, Iterator<Text> iterator, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {

    log.warn("Reducing for key: " + key);

    // Use a map to count occurrences of each distinct value for this key:
    Map<String, MutableInt> matches = new HashMap<String, MutableInt>();
    while (iterator.hasNext()) {
        String m = iterator.next().toString();
        // Get or set up the counter:
        MutableInt value = matches.get(m);
        if (value == null) {
            value = new MutableInt();
            matches.put(m, value);
        }
        // Increment the counter for this match:
        value.inc();
    }

    // Loop through and collect all distinct matches:
    Text result = new Text();
    Text outKey = key;
    OutputCollector<Text, Text> collector;
    int pos = key.find("__");
    if (pos == -1) {
        collector = output;
    } else {
        String[] fp = key.toString().split("__");
        collector = mos.getCollector(fp[0], fp[1], reporter);
        outKey = new Text(fp[1]);
    }
    log.info("For key: " + key + " outKey " + outKey + " outputting " + matches.size() + " unique values.");
    for (String match : matches.keySet()) {
        // This outputs the count:
        result.set(match + "\t" + matches.get(match).get());
        // And collect:
        collector.collect(outKey, result);
    }

}

From source file:uk.bl.wa.hadoop.mapreduce.cdx.TinyCDXServerMapper.java

License:Open Source License

/**
 * Simple command-line harness: converts an ARC/WARC file to CDX lines and
 * feeds each line through the mapper.
 *
 * @param args path to the input (W)ARC file
 * @throws Exception
 */
public static void main(String[] args) throws Exception {

    File inputFile = new File(args[0]);
    ArchiveReader arcreader = ArchiveReaderFactory.get(inputFile);
    arcreader.setStrict(false);
    WarcIndexer warcIndexer = new WarcIndexer();
    ArcIndexer arcIndexer = new ArcIndexer();
    Iterator<CaptureSearchResult> archiveIterator;
    if (inputFile.getName().matches("^.+\\.warc(\\.gz)?$")) {
        archiveIterator = warcIndexer.iterator((WARCReader) arcreader);
    } else {
        archiveIterator = arcIndexer.iterator((ARCReader) arcreader);
    }
    Iterator<String> cdxlines = SearchResultToCDXFormatAdapter.adapt(archiveIterator,
            new CDXFormat(DereferencingArchiveToCDXRecordReader.CDX_11));

    // Test it:
    TinyCDXServerMapper mapper = new TinyCDXServerMapper();
    mapper.tcs = new TinyCDXSender("http://localhost:9090/t3", 20);
    Text cdxline = new Text();
    while (cdxlines.hasNext()) {
        cdxline.set(cdxlines.next());
        mapper.map(cdxline, cdxline, null);
    }
    mapper.tcs.close();

}

From source file:weka.distributed.hadoop.CorrelationMatrixHadoopMapper.java

License:Open Source License

@Override
public void cleanup(Context context) throws IOException, InterruptedException {

    // output all the rows in this partial matrix
    double[][] partialMatrix = m_task.getMatrix();
    int[][] coOcc = m_task.getCoOccurrenceCounts();

    for (int i = 0; i < partialMatrix.length; i++) {
        double[] row = partialMatrix[i];
        int[] co = null;
        if (coOcc != null) {
            co = coOcc[i];
        }
        MatrixRowHolder rh = new MatrixRowHolder(i, row, co);
        byte[] bytes = rowHolderToBytes(rh);

        String sKey = ("" + i);
        Text key = new Text();
        key.set(sKey);

        BytesWritable value = new BytesWritable();
        value.set(bytes, 0, bytes.length);

        context.write(key, value);
    }
}

From source file:weka.distributed.hadoop.CorrelationMatrixRowHadoopReducer.java

License:Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    List<MatrixRowHolder> rowsToAgg = new ArrayList<MatrixRowHolder>();

    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();

            rowsToAgg.add(deserialize(bytes));
        }
    } catch (ClassNotFoundException ex) {
        throw new IOException(ex);
    }

    if (rowsToAgg.size() > 0) {

        int rowNum = rowsToAgg.get(0).getRowNumber();

        List<double[]> rows = new ArrayList<double[]>();
        List<int[]> coOcc = null;
        if (!m_missingsWereReplacedWithMeans) {
            coOcc = new ArrayList<int[]>();
        }

        for (MatrixRowHolder r : rowsToAgg) {
            if (r.getRowNumber() != rowNum) {
                throw new IOException("Matrix row numbers for this key appear to differ!");
            }
            rows.add(r.getRow());
            if (!m_missingsWereReplacedWithMeans) {
                coOcc.add(r.getCoOccurrencesCounts());
            }
        }
        try {
            double[] aggregated = m_task.aggregate(rowsToAgg.get(0).getRowNumber(), rows, coOcc,
                    m_headerWithSummaryAtts, m_missingsWereReplacedWithMeans, m_covariance, m_deleteClassIfSet);

            // assemble Text key (row num) and Text row (space separated
            // values)

            Text outKey = new Text();
            outKey.set("" + rowNum);

            StringBuilder b = new StringBuilder();
            for (int i = 0; i < aggregated.length; i++) {
                if (i < aggregated.length - 1) {
                    b.append("" + aggregated[i]).append(" ");
                } else {
                    b.append("" + aggregated[i]);
                }
            }

            Text outVal = new Text();
            outVal.set(b.toString());
            context.write(outKey, outVal);
        } catch (DistributedWekaException e) {
            throw new IOException(e);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}

From source file:weka.distributed.hadoop.CSVToArffHeaderHadoopMapper.java

License:Open Source License

@Override
public void cleanup(Context context) throws IOException, InterruptedException {

    if (m_fatalMappingError != null) {
        throw m_fatalMappingError;
    }

    HeaderAndQuantileDataHolder holder = null;
    Instances header = null;
    if (!m_estimateQuantiles) {
        header = m_task.getHeader();
    } else {
        try {
            holder = m_task.getHeaderAndQuantileEstimators();
        } catch (DistributedWekaException ex) {
            throw new IOException(ex);
        }
    }

    ByteArrayOutputStream ostream = new ByteArrayOutputStream();
    OutputStream os = ostream;
    ObjectOutputStream p;

    p = new ObjectOutputStream(new BufferedOutputStream(new GZIPOutputStream(os)));
    p.writeObject(header != null ? header : holder);
    p.flush();
    p.close();

    byte[] bytes = ostream.toByteArray();
    // make sure all headers go to the same reducer
    String constantKey = "header";

    Text key = new Text();
    key.set(constantKey);
    BytesWritable value = new BytesWritable();
    value.set(bytes, 0, bytes.length);
    context.write(key, value); // write the header
}

From source file:weka.distributed.hadoop.CSVToArffHeaderHadoopReducer.java

License:Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputDestination = conf.get(CSV_TO_ARFF_HEADER_WRITE_PATH);

    if (outputDestination == null || outputDestination.length() == 0) {
        throw new IOException("No destination given for aggregated ARFF header");
    }

    List<Instances> headersToAgg = new ArrayList<Instances>();
    List<HeaderAndQuantileDataHolder> holdersToAgg = new ArrayList<HeaderAndQuantileDataHolder>();

    int counter = 0;
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            if (m_estimateQuantiles) {
                HeaderAndQuantileDataHolder holder = deserializeHolder(bytes);
                holdersToAgg.add(holder);
            } else {
                Instances aHeader = deserializeHeader(bytes);
                headersToAgg.add(aHeader);
            }
            counter++;
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }

    try {
        Instances aggregated = m_estimateQuantiles ? m_task.aggregateHeadersAndQuartiles(holdersToAgg)
                : m_task.aggregate(headersToAgg);
        writeHeaderToDestination(aggregated, outputDestination, conf);

        Text outkey = new Text();
        outkey.set("AKey");
        Text outval = new Text();
        outval.set("Num headers aggregated " + counter);
        context.write(outkey, outval);

    } catch (Exception e) {
        throw new IOException(e);
    }
}

From source file:weka.distributed.hadoop.KMeansCentroidSketchHadoopMapper.java

License:Open Source License

@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    // emit serialized sketch tasks with run number as key
    for (int i = 0; i < m_tasks.length; i++) {
        System.err.println("Number of instances in sketch: " + m_tasks[i].getCurrentSketch().numInstances());
        System.err.println(
                "Number of instances in reservoir: " + m_tasks[i].getReservoirSample().getSample().size());
        byte[] bytes = sketchToBytes(m_tasks[i]);
        String runNum = "run" + i;
        Text key = new Text();
        key.set(runNum);
        BytesWritable value = new BytesWritable();
        value.set(bytes, 0, bytes.length);
        context.write(key, value);
    }
}

From source file:weka.distributed.hadoop.KMeansCentroidSketchHadoopReducer.java

License:Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {

    int runNum = 0;
    String rS = key.toString();
    rS = rS.replace("run", "");
    try {
        runNum = Integer.parseInt(rS);
    } catch (NumberFormatException ex) {
        throw new IOException(ex);
    }
    CentroidSketch initial = null;

    List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>();
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();

            CentroidSketch current = deserialize(bytes);
            if (initial == null) {
                initial = current;
            } else {
                initial.aggregateReservoir(current.getReservoirSample());
            }

            if (m_isFirstIteration) {
                distsForRun.add(current.getDistanceFunction());
            }
        }

        // add the reservoir to the current sketch
        initial.addReservoirToCurrentSketch();

        // update the distance function with global numeric
        // attribute ranges
        if (m_isFirstIteration) {
            Instances distancePrimingData = KMeansReduceTask
                    .computeDistancePrimingDataFromDistanceFunctions(distsForRun, m_transformedHeaderNoSummary);

            initial.getDistanceFunction().setInstances(distancePrimingData);
        }

        // save the sketch out
        writeSketchToDestination(initial, m_outputDestination, runNum, context.getConfiguration());

        System.err.println("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());
        Text outKey = new Text();
        outKey.set("Summary:\n");
        Text outVal = new Text();
        outVal.set("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());

        context.write(outKey, outVal);
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}