List of usage examples for org.apache.hadoop.io.Text.toString()
@Override
public String toString()
From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java
License:Apache License
@Override protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException, InterruptedException { Iterator<StringTuple> it = values.iterator(); if (!it.hasNext()) { return;/* w w w . j a v a2s. c o m*/ } StringTuple value = it.next(); Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size if (maxNGramSize >= 2) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); try { do { String term = sf.getAttribute(CharTermAttribute.class).toString(); if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } while (sf.incrementToken()); sf.end(); } finally { Closeables.closeQuietly(sf); } } else { for (String term : value.getEntries()) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } } if (sequentialAccess) { vector = new SequentialAccessSparseVector(vector); } if (namedVector) { vector = new NamedVector(vector, key.toString()); } // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk. if (vector.getNumNondefaultElements() > 0) { VectorWritable vectorWritable = new VectorWritable(vector); context.write(key, vectorWritable); } else { context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1); } }
From source file:com.ema.hadoop.bestclient.BCMapper.java
/**
 * Emits {@code (client, amount)} for each record whose date lies strictly between the two
 * dates stored under the {@code "dates"} job property.
 *
 * <p>The input line is split on {@code ';'}: field 0 is the client, field 1 a
 * {@code dd/MM/yyyy} date and field 2 an integer amount. Records that cannot be interpreted
 * (fewer than three fields, a non-integer amount, an unparseable date) are logged and skipped
 * so that a single bad line does not fail the whole task.
 *
 * @param key     input key (not used)
 * @param value   one input line
 * @param context task context supplying the configuration and receiving the output
 * @throws IOException           propagated from writing the output
 * @throws InterruptedException  propagated from writing the output
 * @throws IllegalStateException if the {@code "dates"} property does not hold two values
 */
@Override
public void map(LongWritable key, Text value, Mapper.Context context)
        throws IOException, InterruptedException {
    Logger logger = Logger.getLogger(BCMapper.class.getName());
    // SimpleDateFormat is not thread-safe, so a fresh instance is kept local to the call.
    DateFormat formatter = new SimpleDateFormat("dd/MM/yyyy");

    String[] lineTab = value.toString().split(";");
    if (lineTab.length < 3) {
        logger.log(Level.INFO, "Malformed record, expected 3 fields but found " + lineTab.length);
        return;
    }

    // getStrings is available on the configuration itself; no cast to JobConf is required.
    String[] dateTable = context.getConfiguration().getStrings("dates");
    if (dateTable == null || dateTable.length < 2) {
        throw new IllegalStateException(
                "Job property \"dates\" must contain a start date and an end date");
    }

    try {
        String client = lineTab[0];
        int somme = Integer.parseInt(lineTab[2]);
        Date date = formatter.parse(lineTab[1]);

        Date dateStart = formatter.parse(dateTable[0]);
        Date dateEnd = formatter.parse(dateTable[1]);

        // Both bounds are exclusive.
        if (date.after(dateStart) && date.before(dateEnd)) {
            context.write(new Text(client), new IntWritable(somme));
        } else {
            logger.log(Level.INFO, "ELSE ddddddddddddddddddddd");
            logger.log(Level.INFO, "param start " + dateTable[0]);
            logger.log(Level.INFO, "Date start " + dateStart.toString());
            logger.log(Level.INFO, "param fin " + dateTable[1]);
            logger.log(Level.INFO, "Date fin " + dateEnd.toString());
        }
    } catch (ParseException e) {
        logger.log(Level.INFO, "Parse exception");
    } catch (NumberFormatException e) {
        // Same policy as an unparseable date: report the record and move on.
        logger.log(Level.INFO, "Malformed amount: " + lineTab[2]);
    }
}
From source file:com.ema.hadoop.wordcount.WCMapper.java
/**
 * Emits {@code (token, 1)} for every token of the input line.
 *
 * <p>Tokens are produced by a {@link StringTokenizer} with its default delimiters.
 *
 * @param key     input key (not used)
 * @param value   one input line
 * @param context task context receiving the output pairs
 * @throws IOException          propagated from writing the output
 * @throws InterruptedException propagated from writing the output
 */
@Override
public void map(LongWritable key, Text value, Mapper.Context context)
        throws IOException, InterruptedException {
    for (StringTokenizer words = new StringTokenizer(value.toString()); words.hasMoreTokens();) {
        String word = words.nextToken();
        context.write(new Text(word), new IntWritable(1));
    }
}
From source file:com.endgame.binarypig.loaders.av.ClamScanDaemonLoader.java
License:Apache License
@Override public Tuple processFile(Text key, BytesWritable value, File binaryFile) throws IOException { boolean timedOut = false; Socket sock = null;/*w w w . jav a2s . c o m*/ BufferedReader in = null; OutputStream out = null; String result = ""; try { sock = new Socket(); if (getTimeoutMS() < (long) Integer.MAX_VALUE) { sock.setSoTimeout((int) getTimeoutMS()); } sock.connect(clamdEndoint); out = sock.getOutputStream(); in = new BufferedReader(new InputStreamReader(sock.getInputStream())); out.write(("nSCAN " + binaryFile.getAbsolutePath() + "\n").getBytes()); String data = in.readLine(); if (data != null) { result = data.substring(data.indexOf(':') + 1). // "remove the /path/to/file: " replace(" FOUND", ""). // no need for the "FOUND" string replaceAll("\\([a-f0-9:]+\\)$", "").trim(); // on some versions of clamscan, it adds (MD5:NUM) } } catch (SocketTimeoutException e) { result = ""; timedOut = true; } finally { IOUtils.closeSocket(sock); IOUtils.closeStream(in); IOUtils.closeStream(out); } getProtoTuple().clear(); getProtoTuple().add(key.toString()); getProtoTuple().add(timedOut); getProtoTuple().add(result); return getTupleFactory().newTuple(getProtoTuple()); }
From source file:com.endgame.binarypig.loaders.ExecutingJsonLoader.java
License:Apache License
public Tuple outputToTuple(Text key, BytesWritable value, String output, boolean timedOut) { protoTuple.clear();//from w ww.java2 s . c o m protoTuple.add(key.toString()); protoTuple.add(timedOut); // when adding, might want to consider doing explicit casts from Writables to Pig datatypes - does not appear to be needed at this time //This is the spot to do the generic JSON loading. some override function for data formatting would be here try { protoTuple.add(jsonUtil.wrap(jsonParser.parse(output))); } catch (Exception e) { protoTuple.add(new NonSpillableDataBag()); } return tupleFactory.newTuple(protoTuple); }
From source file:com.endgame.binarypig.loaders.ExecutingTextLoader.java
License:Apache License
/**
 * Converts the output of an executed program into a tuple of
 * {@code (key, timedOut, output)}.
 *
 * <p>The output text is stored unchanged as the third field.
 *
 * @param key      key of the record; its string form becomes the first field
 * @param value    record payload (not used here)
 * @param output   program output, stored as-is
 * @param timedOut whether the program timed out; stored as the second field
 * @return a new tuple built from the prototype list
 */
public Tuple outputToTuple(Text key, BytesWritable value, String output, boolean timedOut) {
    protoTuple.clear();
    protoTuple.add(key.toString());
    protoTuple.add(timedOut);
    // Appending to the list that just accepted the two fields above needs no fallback path.
    protoTuple.add(output);
    return tupleFactory.newTuple(protoTuple);
}
From source file:com.endgame.binarypig.loaders.pehash.HashingLoader.java
License:Apache License
@Override public Tuple outputToTuple(Text key, BytesWritable value, String output, boolean timedOut) { protoTuple.clear();/*from w w w . j ava2 s. co m*/ protoTuple.add(key.toString()); protoTuple.add(timedOut); // when adding, might want to consider doing explicit casts from Writables to Pig datatypes - does not appear to be needed at this time //This is the spot to do the generic JSON loading. some override function for data formatting would be here try { Map<String, String> events = gson.fromJson(output, new TypeToken<Map<String, String>>() { }.getType()); protoTuple.add(events.get("md5")); protoTuple.add(events.get("sha1")); protoTuple.add(events.get("sha256")); protoTuple.add(events.get("sha512")); protoTuple.add(events.get("pe_hash")); } catch (Exception e) { protoTuple.add(""); protoTuple.add(""); protoTuple.add(""); protoTuple.add(""); protoTuple.add(""); } return tupleFactory.newTuple(protoTuple); }
From source file:com.endgame.binarypig.loaders.TextDaemonLoader.java
License:Apache License
@Override public Tuple processFile(Text key, BytesWritable value, File binaryFile) throws IOException { boolean timedOut = false; String result = ""; try {//from w w w . j a v a 2s . co m out.write((binaryFile.getAbsolutePath() + "\n").getBytes()); String data = in.readLine(); if (data != null) { result = data; } } catch (SocketTimeoutException e) { result = ""; timedOut = true; } catch (SocketException e) { System.err.println("WARN: Exception occurred, attempting to re-connect..."); e.printStackTrace(); close(); init(); out.write((binaryFile.getAbsolutePath() + "\n").getBytes()); String data = in.readLine(); if (data != null) { result = data; } } getProtoTuple().clear(); getProtoTuple().add(key.toString()); getProtoTuple().add(timedOut); getProtoTuple().add(result); return getTupleFactory().newTuple(getProtoTuple()); }
From source file:com.example.wordcount.operator.WordCountOperator.java
License:Apache License
/**
 * Splits the query of every log line on {@code SPACE} and adds one {@code WordCount} with a
 * count of 1 per resulting word.
 *
 * <p>Log lines whose query is empty contribute nothing.
 *
 * @param lines  log lines of the current group
 * @param result sink receiving one {@code WordCount} per word
 */
@CoGroup
public void split(@Key(group = {}) List<LogLine> lines, Result<WordCount> result) {
    for (LogLine line : lines) {
        String queryText = line.getQuery().toString();
        if (!queryText.isEmpty()) {
            for (String word : queryText.split(SPACE)) {
                WordCount single = new WordCount();
                single.setWordAsString(word);
                single.setCount(1);
                result.add(single);
            }
        }
    }
}
From source file:com.examples.ch03.CLFMapper_Ex_1.java
@Override protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { String entry = value.toString(); System.out.println(entry);/* ww w .jav a2 s . co m*/ Matcher m = p.matcher(entry); if (!m.matches()) { return; } Date date = null; try { date = dateFormatter.parse(m.group(4)); } catch (ParseException ex) { return; } outputKey.set(m.group(1)); //ip StringBuilder b = new StringBuilder(); b.append(date.getTime()); //timestamp b.append('\t'); b.append(m.group(6)); //page b.append('\t'); b.append(m.group(8)); //http status b.append('\t'); b.append(m.group(9)); //bytes b.append('\t'); b.append(m.group(12)); //useragent outputValue.set(b.toString()); context.write(outputKey, outputValue); }