List of usage examples for org.apache.hadoop.io Text toString
@Override
public String toString()
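Text stores its contents as UTF-8 encoded bytes; toString() decodes those bytes into a java.lang.String. As a minimal, hypothetical sketch of the round trip (the class and variable names below are illustrative only, not taken from the projects listed on this page):

    import org.apache.hadoop.io.Text;

    public class TextToStringDemo {
        public static void main(String[] args) {
            // Text wraps a UTF-8 encoded byte array.
            Text text = new Text("hello hadoop");

            // toString() decodes the underlying bytes back into a String.
            String decoded = text.toString();

            System.out.println(decoded); // prints "hello hadoop"
        }
    }

The examples below, collected from real projects, all follow the same pattern: a Text arrives as a key, value, or field, and toString() is called to obtain a String for parsing, splitting, or validation.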
From source file:com.csiro.hadoop.UFORecordValidationMapper.java
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        if (validate(line, context)) {
            context.write(key, value);
        }
    }
From source file:com.dappervision.hbase.mapred.TypedBytesTableReducer.java
License:Apache License
    @Override
    public void reduce(Text key, Iterator<Text> values,
            OutputCollector<TypedBytesWritable, TypedBytesWritable> outputCollector, Reporter arg3)
            throws IOException {
        byte[] keyBytes = key.getBytes();
        TypedBytesWritable keyWritable = new TypedBytesWritable();
        TypedBytesWritable valueWritable = new TypedBytesWritable();
        keyWritable.setValue(new Buffer(keyBytes));

        // Merge the column family and qualifier.
        // Each value has the form "family:qualifier=value": the column family is separated
        // by a colon (:), the qualifier and value by an equals sign (=).
        HashMap<String, HashMap<String, String>> cfMap = new HashMap<String, HashMap<String, String>>();
        while (values.hasNext()) {
            Text value = values.next();
            String strVal = value.toString();
            String[] cf_qual_val_parts = strVal.split(":");
            String cf = cf_qual_val_parts[0];
            String qual_val = cf_qual_val_parts[1];
            String[] qual_val_parts = qual_val.split("=");
            String qual = qual_val_parts[0];
            String val = qual_val_parts[1];

            HashMap<String, String> qualMap = cfMap.get(cf);
            if (qualMap == null) {
                qualMap = new HashMap<String, String>();
                cfMap.put(cf, qualMap);
            }
            // A duplicated qualifier is simply replaced; if using Buffer keys we would have to handle that ourselves.
            qualMap.put(qual, val);
        }

        HashMap<Buffer, HashMap<Buffer, Buffer>> bufMap = new HashMap<Buffer, HashMap<Buffer, Buffer>>();
        Set<Entry<String, HashMap<String, String>>> entrySet = cfMap.entrySet();
        for (Entry<String, HashMap<String, String>> entry : entrySet) {
            HashMap<String, String> qualValMap = entry.getValue();
            HashMap<Buffer, Buffer> qualValBufMap = new HashMap<Buffer, Buffer>();
            for (Entry<String, String> qualValEntry : qualValMap.entrySet()) {
                qualValBufMap.put(new Buffer(qualValEntry.getKey().getBytes()),
                        new Buffer(qualValEntry.getValue().getBytes()));
            }
            bufMap.put(new Buffer(entry.getKey().getBytes()), qualValBufMap);
        }
        valueWritable.setValue(bufMap);
        outputCollector.collect(keyWritable, valueWritable);
    }
From source file:com.dasasian.chok.lucene.integration.LuceneClientTest.java
License:Apache License
    @Test
    public void testFilteredSearch() throws Exception {
        // write and deploy test index
        File filterIndex = temporaryFolder.newFolder("filterIndex");
        File filterShard = new File(filterIndex, "filterShard");
        String textFieldName = "textField";
        String filterFieldName = "filterField";
        IndexWriter indexWriter = new IndexWriter(FSDirectory.open(filterShard),
                new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.UNLIMITED);
        for (int i = 0; i < 100; i++) {
            Document document = new Document();
            document.add(new Field(textFieldName, "sample " + i, Store.YES, Index.NOT_ANALYZED));
            document.add(new Field(filterFieldName, "" + (i % 10), Store.YES, Index.NOT_ANALYZED));
            indexWriter.addDocument(document);
        }
        indexWriter.close(true);

        DeployClient deployClient = new DeployClient(miniCluster.createInteractionProtocol());
        IndexState indexState = deployClient.addIndex(filterIndex.getName(), filterIndex.getAbsolutePath(), 1)
                .joinDeployment();
        assertEquals(IndexState.DEPLOYED, indexState);

        // build filter for terms in set {i | (i % 10) == 3}.
        LuceneClient client = new LuceneClient(miniCluster.getZkConfiguration());
        TermQuery filterQuery = new TermQuery(new Term(filterFieldName, "3"));
        QueryWrapperFilter filter = new QueryWrapperFilter(filterQuery);
        final Query query = new QueryParser(Version.LUCENE_30, "", new KeywordAnalyzer())
                .parse(textFieldName + ":" + "sample*3");
        final Hits hits = client.search(query, new String[] { filterIndex.getName() }, 100, null, filter);
        assertNotNull(hits);
        List<Hit> hitsList = hits.getHits();
        for (final Hit hit : hitsList) {
            writeToLog(hit);
        }
        assertEquals(10, hits.size());
        assertEquals(10, hitsList.size());

        // check that returned results conform to the filter
        for (final Hit hit : hitsList) {
            MapWritable mw = client.getDetails(hit);
            Text text = (Text) mw.get(new Text("textField"));
            assertNotNull(text);
            String[] parts = text.toString().split(" ");
            assertTrue(parts.length == 2);
            int num = Integer.valueOf(parts[1]);
            assertTrue((num % 10) == 3);
        }
        client.close();
    }
From source file:com.dasasian.chok.mapfile.MapFileClient.java
License:Apache License
    public List<String> get(final String key, final String[] indexNames) throws ChokException {
        ClientResult<TextArrayWritable> results = chokClient.broadcastToIndices(TIMEOUT, true, GET_METHOD,
                GET_METHOD_SHARD_ARG_IDX, indexNames, new Text(key), null);
        if (results.isError()) {
            throw results.getChokException();
        }
        List<String> stringResults = new ArrayList<>();
        for (TextArrayWritable taw : results.getResults()) {
            for (Writable w : taw.array.get()) {
                Text text = (Text) w;
                stringResults.add(text.toString());
            }
        }
        return stringResults;
    }
From source file:com.dasasian.chok.mapfile.MapFileServerTest.java
License:Apache License
    protected String getOneResult(IMapFileServer server, String key, String[] shards) throws Exception {
        TextArrayWritable texts = server.get(new Text(key), shards);
        assertNotNull(texts);
        assertNotNull(texts.array);
        Writable[] array = texts.array.get();
        assertEquals(1, array.length);
        assertTrue(array[0] instanceof Text);
        Text text = (Text) array[0];
        return text.toString();
    }
From source file:com.dataflowdeveloper.detection.URLDetector.java
License:Apache License
    /**
     * UDF Evaluation
     *
     * @param s Text passed in
     * @return Text cleaned
     */
    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }
        try {
            String cleaned = Util.detect(s.toString());
            return new Text(cleaned);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
From source file:com.datasalt.pangool.examples.gameoflife.GameOfLifeJob.java
License:Apache License
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            failArguments("Wrong number of arguments");
            return -1;
        }
        String output = args[0];
        String input = GameOfLifeJob.class.getName() + "-prepared-input";
        delete(output);
        delete(input);
        final int gridSize = Integer.parseInt(args[1]);

        // Write the input of the job as a set of (min, max) intervals
        // Each number between (min, max) represents a possible initial configuration for Game of Life
        int parallelism = Integer.parseInt(args[2]);
        int maxCombinations = (int) Math.pow(2, gridSize * gridSize);
        int splitSize = maxCombinations / parallelism;
        FileSystem fS = FileSystem.get(conf);
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fS.create(new Path(input))));
        for (int i = 0; i < parallelism; i++) {
            writer.write(((i * splitSize) + 1) + "\t" + ((i + 1) * splitSize) + "\n");
        }
        writer.close();

        // Optional parameters: maxX, maxY, #iterations
        final int maxX = conf.getInt("gol.max_x", 32);
        final int maxY = conf.getInt("gol.max_y", 32);
        final int iterations = conf.getInt("gol.iterations", 1000);
        Log.info("using parameters: maxX grid: " + maxX + " maxY grid: " + maxY + " max #iterations: "
                + iterations);

        // Define the intermediate schema: a pair of ints
        final Schema schema = new Schema("minMax", Fields.parse("min:int, max:int"));
        TupleMRBuilder job = new TupleMRBuilder(conf);
        job.addIntermediateSchema(schema);
        job.setGroupByFields("min", "max");
        job.setCustomPartitionFields("min");

        // Define the input and its associated mapper
        // The mapper will just emit the (min, max) pairs to the reduce stage
        job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
                new TupleMapper<LongWritable, Text>() {
                    Tuple tuple = new Tuple(schema);

                    @Override
                    public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                            throws IOException, InterruptedException {
                        String[] fields = value.toString().split("\t");
                        tuple.set("min", Integer.parseInt(fields[0]));
                        tuple.set("max", Integer.parseInt(fields[1]));
                        collector.write(tuple);
                    }
                });

        // Define the reducer
        // The reducer will run as many games of life as (max - min) for each interval it receives
        // It will emit the inputs of GOL that converged together with the number of iterations
        // Note that inputs that produce grid overflow are ignored (but may have longer iteration convergence)
        job.setTupleReducer(new TupleReducer<Text, NullWritable>() {
            public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
                    Collector collector) throws IOException, InterruptedException, TupleMRException {
                int min = (Integer) group.get("min"), max = (Integer) group.get("max");
                for (int i = min; i < max; i++) {
                    try {
                        GameOfLife gameOfLife = new GameOfLife(gridSize, GameOfLife.longToBytes((long) i),
                                maxX, maxY, iterations);
                        while (true) {
                            gameOfLife.nextCycle();
                        }
                    } catch (GameOfLifeException e) {
                        context.getHadoopContext().progress();
                        context.getHadoopContext().getCounter("stats", e.getCauseMessage() + "").increment(1);
                        if (e.getCauseMessage().equals(CauseMessage.CONVERGENCE_REACHED)) {
                            collector.write(new Text(Arrays.toString(GameOfLife.longToBytes((long) i)) + "\t"
                                    + e.getIterations()), NullWritable.get());
                        }
                    }
                }
            }
        });

        job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
                NullWritable.class);
        try {
            job.createJob().waitForCompletion(true);
        } finally {
            job.cleanUpInstanceFiles();
        }
        delete(input);
        return 0;
    }
From source file:com.datasalt.pangool.flow.mapred.TextMapper.java
License:Apache License
    @Override
    public void map(Object keyToIgnore, Text value, TupleMRContext context, Collector collector)
            throws IOException, InterruptedException {
        op.process(value.toString(), callback);
    }
From source file:com.datasalt.pangool.solr.TupleSolrOutputFormatExample.java
License:Apache License
    public int run(String input, String output, Configuration conf) throws Exception {
        // Define the intermediate schema: It must match SOLR's schema.xml!
        final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));

        TupleMRBuilder job = new TupleMRBuilder(conf);
        job.addIntermediateSchema(schema);
        job.setGroupByFields("user_id");

        // Define the input and its associated mapper.
        // We'll just have a Mapper; the reducer will be the Identity reducer.
        job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
                new TupleMapper<LongWritable, Text>() {
                    Tuple tuple = new Tuple(schema);

                    @Override
                    public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                            throws IOException, InterruptedException {
                        String[] fields = value.toString().split("\t");
                        String language = fields[1];
                        tuple.set("user_id", fields[0]);
                        tuple.set("message", fields[2]);
                        if (language.equals("en")) {
                            // English -> write to main output
                            collector.write(tuple);
                        } else if (language.equals("fr")) {
                            // French -> write to french index
                            collector.getNamedOutput("FR").write(tuple, NullWritable.get());
                        } else if (language.equals("es")) {
                            // Spanish -> write to spanish index
                            collector.getNamedOutput("ES").write(tuple, NullWritable.get());
                        }
                    }
                });

        // Add multi-output: French index
        job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
                ITuple.class, NullWritable.class);
        // Add multi-output: Spanish index
        job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
                ITuple.class, NullWritable.class);
        job.setTupleReducer(new IdentityTupleReducer());
        // Add multi-output: English index
        job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"), conf),
                ITuple.class, NullWritable.class);

        Job hadoopJob = job.createJob();
        try {
            hadoopJob.waitForCompletion(true);
            if (!hadoopJob.isSuccessful()) {
                throw new PangoolRuntimeException("Job was not successful");
            }
        } finally {
            job.cleanUpInstanceFiles();
        }
        return 0;
    }
From source file:com.datasalt.pangool.tuplemr.mapred.TestRollup.java
License:Apache License
    /**
     * Checks that {@link RollupReducer} properly calls {@link TupleReducer#onOpenGroup},
     * {@link TupleReducer#onCloseGroup} and {@link TupleReducer#onGroupElements}, and checks that the
     * elements (tuples) passed are coherent. This method assumes a specific output from the
     * {@link TupleReducer}. The output needs to be a Text,Text for key and value. This is the format used:
     * key("OPEN depth"), value("serialized value")
     * key("CLOSE depth"), value("serialized value")
     * key("ELEMENT"), value("serialized element") (a record like this is needed for every element received in onElements)
     *
     * For instance:
     * key("OPEN 0"), value("element1")
     * key("OPEN 1"), value("element1")
     * key("ELEMENT"), value("element1")
     * key("ELEMENT"), value("element2")
     * key("CLOSE 1"), value("element2")
     * key("CLOSE 0"), value("element2")
     */
    public void checkRollupOutput(Path path, int minDepth, int maxDepth) throws IOException {
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(getConf()), path, getConf());

        Text actualKey = new Text();
        Text actualValue = new Text();
        reader.next(actualKey, actualValue); // first action
        String currentKey = actualKey.toString();
        String currentValue = actualValue.toString();

        Assert.assertTrue("First output needs to be an OPEN ", currentKey.startsWith("OPEN"));
        int currentDepth = Integer.parseInt(currentKey.split(" ")[1]);
        Assert.assertEquals("First OPEN needs to match minDepth", minDepth, currentDepth);
        int lastDepth = currentDepth;
        String lastValue = currentValue;
        State lastState = State.OPEN;

        while (reader.next(actualKey, actualValue)) {
            currentKey = actualKey.toString();
            currentValue = actualValue.toString();
            if (currentKey.startsWith("OPEN")) {
                currentDepth = Integer.parseInt(currentKey.split(" ")[1]);
                Assert.assertEquals("OPEN needs to increase depth in +1 ", lastDepth + 1, currentDepth);
                Assert.assertTrue("Too many OPENs, over maxDepth ", maxDepth >= currentDepth);
                if (lastState == State.OPEN) {
                    Assert.assertEquals("First element in OPEN needs to match first element in previous OPEN",
                            lastValue, currentValue);
                } else if (lastState == State.CLOSE) {
                    Assert.assertNotSame(
                            "Element from new group needs to be different from last element from last group ",
                            lastValue, currentValue);
                } else {
                    Assert.fail("Not allowed OPEN after ELEMENT");
                }
                lastState = State.OPEN;
                lastValue = currentValue;
                lastDepth = currentDepth;
            } else if (currentKey.startsWith("CLOSE")) {
                currentDepth = Integer.parseInt(currentKey.split(" ")[1]);
                Assert.assertNotSame("Not allowed CLOSE after OPEN , needs at least one ELEMENT in between",
                        State.OPEN, lastState);
                Assert.assertEquals("CLOSE depth needs to match previous OPEN depth", lastDepth, currentDepth);
                Assert.assertEquals("Element in CLOSE needs to match lastElement in group", lastValue,
                        currentValue);
                lastState = State.CLOSE;
                lastValue = currentValue;
                lastDepth = currentDepth - 1;
            } else if (currentKey.startsWith("ELEMENT")) {
                Assert.assertNotSame("Not allowed ELEMENT after CLOSE, needs an OPEN or ELEMENT before",
                        State.CLOSE, lastState);
                lastState = State.ELEMENT;
                lastValue = currentValue;
            }
        }

        Assert.assertEquals("File doesn't properly finishes with a CLOSE ", State.CLOSE, lastState);
        Assert.assertEquals("Last CLOSE doesn't close the minDepth ", minDepth - 1, lastDepth);
        reader.close();
    }