List of usage examples for org.apache.hadoop.io.Text#set

public void set(Text other)

Note: Text also provides set(String string), set(byte[] utf8), and set(byte[] utf8, int start, int len); most of the examples below use the String overload.
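The recurring pattern in all of the examples below is to allocate one reusable Text instance and call set() immediately before each write, rather than constructing a new Text per record. The following minimal sketch illustrates that pattern; the class name UpperCaseMapper and the upper-casing transform are illustrative only and do not come from the source files listed here.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative mapper: emits each input line and its upper-cased form,
// reusing a single Text instance for the key and one for the value.
public class UpperCaseMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Reused across map() calls; set() replaces the contents without reallocating.
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        outKey.set(value);                            // set(Text other): copy the bytes of another Text
        outValue.set(value.toString().toUpperCase()); // set(String): replace contents from a String
        context.write(outKey, outValue);
    }
}

Reusing a single Text instance this way avoids allocating a new object per record; the framework serializes the key and value at the point of context.write, so overwriting them on the next iteration is safe.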
From source file:uk.bl.wa.hadoop.ArchiveFileRecordReader.java
License:Open Source License
@Override
public boolean next(Text key, WritableArchiveRecord value) throws IOException {
    boolean found = false;
    while (!found) {
        boolean hasNext = false;
        try {
            hasNext = iterator.hasNext();
        } catch (Throwable e) {
            log.error("ERROR in hasNext(): " + this.archiveName + ": " + e.toString());
            hasNext = false;
        }
        try {
            if (hasNext) {
                record = (ArchiveRecord) iterator.next();
                found = true;
                key.set(this.archiveName);
                value.setRecord(record);
            } else if (!this.nextFile()) {
                break;
            }
        } catch (Throwable e) {
            found = false;
            log.error("ERROR reading " + this.archiveName, e);
            // Reached the end of the file? If so move on or exit:
            if (e.getCause() instanceof EOFException) {
                log.error("EOF while reading " + this.archiveName);
                if (!this.nextFile()) {
                    break;
                }
            }
        }
    }
    return found;
}
From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXMapperTest.java
License:Open Source License
@Test
public void testMapper() throws IOException, JSONException {
    Set<String> skippableRecords = new HashSet<String>();
    skippableRecords.add("application/warc-fields");
    skippableRecords.add("text/dns");

    File inputFile = new File(
            "../warc-indexer/src/test/resources/gov.uk-revisit-warcs/BL-20140325121225068-00000-32090~opera~8443.warc.gz");
    String archiveName = inputFile.getName();

    ArchiveReader reader = ArchiveReaderFactory.get(inputFile);
    Iterator<ArchiveRecord> ir = reader.iterator();
    ArchiveRecord record;
    Text key = new Text();
    WritableArchiveRecord value = new WritableArchiveRecord();
    while (ir.hasNext()) {
        record = (ArchiveRecord) ir.next();
        key.set(archiveName);
        value.setRecord(record);
        LOG.info("GOT: " + record.getHeader().getRecordIdentifier());
        LOG.info("GOT: " + record.getHeader().getMimetype());
        // Skip records that can't be analysed:
        if (skippableRecords.contains(record.getHeader().getMimetype()))
            continue;
        // Run through them all:
        LOG.info("Running without testing output...");
        mapDriver.setInput(key, value);
        List<Pair<Text, Text>> result = mapDriver.run();
        if (result != null && result.size() > 0) {
            MDX mdx = new MDX(result.get(0).getSecond().toString());
            LOG.info("RESULT MDX: " + mdx);
            // Perform a specific check for one of the items:
            if ("http://data.gov.uk/".equals(Normalisation.sanitiseWARCHeaderValue(record.getHeader().getUrl()))
                    && record.getHeader().getMimetype().contains("response")) {
                Text testKey = new Text("sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG");
                MDX testMdx = new MDX(
                        "{\"digest\":\"sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG\",\"url\":\"http://data.gov.uk/\",\"timestamp\":\"20140325121238\"}");
                assertEquals(testKey, result.get(0).getFirst());
                assertEquals(testMdx.getUrl(), mdx.getUrl());
                assertEquals(testMdx.getHash(), mdx.getHash());
                assertEquals(testMdx.getTs(), mdx.getTs());
            }
        }
        mapDriver.resetOutput();
    }
}
From source file:uk.bl.wa.hadoop.mapred.FrequencyCountingReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterator<Text> iterator, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    log.warn("Reducing for key: " + key);
    // Use a simple map to collect only distinct results for this key:
    Map<String, MutableInt> matches = new HashMap<String, MutableInt>();
    while (iterator.hasNext()) {
        String m = iterator.next().toString();
        // Get or set up the counter:
        MutableInt value = matches.get(m);
        if (value == null) {
            value = new MutableInt();
            matches.put(m, value);
        }
        // Increment the counter for this match:
        value.inc();
    }
    // Loop through and collect all distinct matches:
    Text result = new Text();
    Text outKey = key;
    OutputCollector<Text, Text> collector;
    int pos = key.find("__");
    if (pos == -1) {
        collector = output;
    } else {
        String[] fp = key.toString().split("__");
        collector = mos.getCollector(fp[0], fp[1], reporter);
        outKey = new Text(fp[1]);
    }
    log.info("For key: " + key + " outKey " + outKey + " outputting " + matches.size() + " unique values.");
    for (String match : matches.keySet()) {
        // This outputs the count:
        result.set(match + "\t" + matches.get(match).get());
        // And collect:
        collector.collect(outKey, result);
    }
}
From source file:uk.bl.wa.hadoop.mapreduce.cdx.TinyCDXServerMapper.java
License:Open Source License
/**
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    File inputFile = new File(args[0]);
    ArchiveReader arcreader = ArchiveReaderFactory.get(inputFile);
    arcreader.setStrict(false);
    WarcIndexer warcIndexer = new WarcIndexer();
    ArcIndexer arcIndexer = new ArcIndexer();
    Iterator<CaptureSearchResult> archiveIterator;
    if (inputFile.getName().matches("^.+\\.warc(\\.gz)?$")) {
        archiveIterator = warcIndexer.iterator((WARCReader) arcreader);
    } else {
        archiveIterator = arcIndexer.iterator((ARCReader) arcreader);
    }
    Iterator<String> cdxlines = SearchResultToCDXFormatAdapter.adapt(archiveIterator,
            new CDXFormat(DereferencingArchiveToCDXRecordReader.CDX_11));
    // Test it:
    TinyCDXServerMapper mapper = new TinyCDXServerMapper();
    mapper.tcs = new TinyCDXSender("http://localhost:9090/t3", 20);
    Text cdxline = new Text();
    while (cdxlines.hasNext()) {
        cdxline.set(cdxlines.next());
        mapper.map(cdxline, cdxline, null);
    }
    mapper.tcs.close();
}
From source file:weka.distributed.hadoop.CorrelationMatrixHadoopMapper.java
License:Open Source License
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    // output all the rows in this partial matrix
    double[][] partialMatrix = m_task.getMatrix();
    int[][] coOcc = m_task.getCoOccurrenceCounts();
    for (int i = 0; i < partialMatrix.length; i++) {
        double[] row = partialMatrix[i];
        int[] co = null;
        if (coOcc != null) {
            co = coOcc[i];
        }
        MatrixRowHolder rh = new MatrixRowHolder(i, row, co);
        byte[] bytes = rowHolderToBytes(rh);
        String sKey = ("" + i);
        Text key = new Text();
        key.set(sKey);
        BytesWritable value = new BytesWritable();
        value.set(bytes, 0, bytes.length);
        context.write(key, value);
    }
}
From source file:weka.distributed.hadoop.CorrelationMatrixRowHadoopReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    List<MatrixRowHolder> rowsToAgg = new ArrayList<MatrixRowHolder>();
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            rowsToAgg.add(deserialize(bytes));
        }
    } catch (ClassNotFoundException ex) {
        throw new IOException(ex);
    }
    if (rowsToAgg.size() > 0) {
        int rowNum = rowsToAgg.get(0).getRowNumber();
        List<double[]> rows = new ArrayList<double[]>();
        List<int[]> coOcc = null;
        if (!m_missingsWereReplacedWithMeans) {
            coOcc = new ArrayList<int[]>();
        }
        for (MatrixRowHolder r : rowsToAgg) {
            if (r.getRowNumber() != rowNum) {
                throw new IOException("Matrix row numbers for this key appear to differ!");
            }
            rows.add(r.getRow());
            if (!m_missingsWereReplacedWithMeans) {
                coOcc.add(r.getCoOccurrencesCounts());
            }
        }
        try {
            double[] aggregated = m_task.aggregate(rowsToAgg.get(0).getRowNumber(), rows, coOcc,
                    m_headerWithSummaryAtts, m_missingsWereReplacedWithMeans, m_covariance, m_deleteClassIfSet);
            // assemble Text key (row num) and Text row (space separated values)
            Text outKey = new Text();
            outKey.set("" + rowNum);
            StringBuilder b = new StringBuilder();
            for (int i = 0; i < aggregated.length; i++) {
                if (i < aggregated.length - 1) {
                    b.append("" + aggregated[i]).append(" ");
                } else {
                    b.append("" + aggregated[i]);
                }
            }
            Text outVal = new Text();
            outVal.set(b.toString());
            context.write(outKey, outVal);
        } catch (DistributedWekaException e) {
            throw new IOException(e);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}
From source file:weka.distributed.hadoop.CSVToArffHeaderHadoopMapper.java
License:Open Source License
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    if (m_fatalMappingError != null) {
        throw m_fatalMappingError;
    }
    HeaderAndQuantileDataHolder holder = null;
    Instances header = null;
    if (!m_estimateQuantiles) {
        header = m_task.getHeader();
    } else {
        try {
            holder = m_task.getHeaderAndQuantileEstimators();
        } catch (DistributedWekaException ex) {
            throw new IOException(ex);
        }
    }
    ByteArrayOutputStream ostream = new ByteArrayOutputStream();
    OutputStream os = ostream;
    ObjectOutputStream p;
    p = new ObjectOutputStream(new BufferedOutputStream(new GZIPOutputStream(os)));
    p.writeObject(header != null ? header : holder);
    p.flush();
    p.close();
    byte[] bytes = ostream.toByteArray();

    // make sure all headers go to the same reducer
    String contantKey = "header";
    Text key = new Text();
    key.set(contantKey);
    BytesWritable value = new BytesWritable();
    value.set(bytes, 0, bytes.length);
    context.write(key, value); // write the header
}
From source file:weka.distributed.hadoop.CSVToArffHeaderHadoopReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputDestination = conf.get(CSV_TO_ARFF_HEADER_WRITE_PATH);
    if (outputDestination == null || outputDestination.length() == 0) {
        throw new IOException("No destination given for aggregated ARFF header");
    }
    List<Instances> headersToAgg = new ArrayList<Instances>();
    List<HeaderAndQuantileDataHolder> holdersToAgg = new ArrayList<HeaderAndQuantileDataHolder>();
    int counter = 0;
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            if (m_estimateQuantiles) {
                HeaderAndQuantileDataHolder holder = deserializeHolder(bytes);
                holdersToAgg.add(holder);
            } else {
                Instances aHeader = deserializeHeader(bytes);
                headersToAgg.add(aHeader);
            }
            counter++;
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }
    try {
        Instances aggregated = m_estimateQuantiles ? m_task.aggregateHeadersAndQuartiles(holdersToAgg)
                : m_task.aggregate(headersToAgg);
        writeHeaderToDestination(aggregated, outputDestination, conf);
        Text outkey = new Text();
        outkey.set("AKey");
        Text outval = new Text();
        outval.set("Num headers aggregated " + counter);
        context.write(outkey, outval);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:weka.distributed.hadoop.KMeansCentroidSketchHadoopMapper.java
License:Open Source License
@Override
public void cleanup(Context context) throws IOException, InterruptedException {
    // emit serialized sketch tasks with run number as key
    for (int i = 0; i < m_tasks.length; i++) {
        System.err.println("Number of instances in sketch: " + m_tasks[i].getCurrentSketch().numInstances());
        System.err.println(
                "Number of instances in reservoir: " + m_tasks[i].getReservoirSample().getSample().size());
        byte[] bytes = sketchToBytes(m_tasks[i]);
        String runNum = "run" + i;
        Text key = new Text();
        key.set(runNum);
        BytesWritable value = new BytesWritable();
        value.set(bytes, 0, bytes.length);
        context.write(key, value);
    }
}
From source file:weka.distributed.hadoop.KMeansCentroidSketchHadoopReducer.java
License:Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    int runNum = 0;
    String rS = key.toString();
    rS = rS.replace("run", "");
    try {
        runNum = Integer.parseInt(rS);
    } catch (NumberFormatException ex) {
        throw new IOException(ex);
    }
    CentroidSketch initial = null;
    List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>();
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            CentroidSketch current = deserialize(bytes);
            if (initial == null) {
                initial = current;
            } else {
                initial.aggregateReservoir(current.getReservoirSample());
            }
            if (m_isFirstIteration) {
                distsForRun.add(current.getDistanceFunction());
            }
        }
        // add the reservoir to the current sketch
        initial.addReservoirToCurrentSketch();
        // update the distance function with global numeric attribute ranges
        if (m_isFirstIteration) {
            Instances distancePrimingData = KMeansReduceTask
                    .computeDistancePrimingDataFromDistanceFunctions(distsForRun, m_transformedHeaderNoSummary);
            initial.getDistanceFunction().setInstances(distancePrimingData);
        }
        // save the sketch out
        writeSketchToDestination(initial, m_outputDestination, runNum, context.getConfiguration());
        System.err.println("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());
        Text outKey = new Text();
        outKey.set("Summary:\n");
        Text outVal = new Text();
        outVal.set("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());
        context.write(outKey, outVal);
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}