Example usage for org.apache.hadoop.io.Text.toString()

Introduction

This page collects example usages of org.apache.hadoop.io.Text.toString().

Prototype

@Override
public String toString()

Source Link

Document

Converts the text back to a string.

Usage

From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java

License:Apache License

/**
 * Builds a term-count vector for {@code key} from the first tuple in {@code values} and
 * writes it to the context.
 *
 * <p>Only the first tuple of the iterable is consumed; if there is none, nothing is written.
 * Terms that are empty or absent from {@code dictionary} are ignored. A vector that ends up
 * with no non-default elements is not written; a counter is incremented instead.
 *
 * @param key     key the vector is written under (also used as the name when
 *                {@code namedVector} is set)
 * @param values  tuples grouped under {@code key}
 * @param context context receiving the vector or the empty-vector counter update
 */
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> tuples = values.iterator();
    if (!tuples.hasNext()) {
        return;
    }
    StringTuple document = tuples.next();

    // The tuple length is only a guess at the initial capacity.
    Vector counts = new RandomAccessSparseVector(dimension, document.length());

    if (maxNGramSize < 2) {
        // Unigrams: count the tuple entries directly.
        for (String token : document.getEntries()) {
            if (!token.isEmpty() && dictionary.containsKey(token)) {
                int index = dictionary.get(token);
                counts.setQuick(index, counts.getQuick(index) + 1);
            }
        }
    } else {
        // N-grams: count the terms produced by a ShingleFilter over the tuple entries.
        ShingleFilter shingles = new ShingleFilter(
                new IteratorTokenStream(document.getEntries().iterator()), maxNGramSize);
        try {
            // The term attribute is read once before the first incrementToken() call;
            // an empty term is rejected by the isEmpty() check below.
            do {
                String ngram = shingles.getAttribute(CharTermAttribute.class).toString();
                if (!ngram.isEmpty() && dictionary.containsKey(ngram)) {
                    int index = dictionary.get(ngram);
                    counts.setQuick(index, counts.getQuick(index) + 1);
                }
            } while (shingles.incrementToken());

            shingles.end();
        } finally {
            Closeables.closeQuietly(shingles);
        }
    }

    if (sequentialAccess) {
        counts = new SequentialAccessSparseVector(counts);
    }

    if (namedVector) {
        counts = new NamedVector(counts, key.toString());
    }

    // A vector with nothing from the dictionary is not worth writing out; just count it.
    if (counts.getNumNondefaultElements() <= 0) {
        context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1);
    } else {
        context.write(key, new VectorWritable(counts));
    }
}

From source file:com.ema.hadoop.bestclient.BCMapper.java

/**
 * Emits {@code (client, amount)} for every input record whose date lies strictly between
 * the two dates stored in the {@code dates} job property.
 *
 * <p>Each input line is split on {@code ;} and read as {@code client;dd/MM/yyyy;amount}.
 * Lines with fewer than three fields, an unparseable date or a non-numeric amount are
 * logged and skipped rather than failing the task.
 *
 * @param key     input key (unused)
 * @param value   one input line
 * @param context context used to read the job configuration and to emit the result
 * @throws IllegalStateException if the {@code dates} property does not hold two values
 */
@Override
public void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException {
    Logger logger = Logger.getLogger(BCMapper.class.getName());
    DateFormat formatter = new SimpleDateFormat("dd/MM/yyyy");

    String line = value.toString();
    String[] lineTab = line.split(";");
    if (lineTab.length < 3) {
        // Too few fields to hold client, date and amount.
        logger.log(Level.INFO, "Skipping malformed record: " + line);
        return;
    }

    try {
        String client = lineTab[0];
        int somme = Integer.parseInt(lineTab[2]);
        Date date = formatter.parse(lineTab[1]);

        // getStrings is defined on Configuration itself, so no cast to JobConf is needed.
        String[] dateTable = context.getConfiguration().getStrings("dates");
        if (dateTable == null || dateTable.length < 2) {
            throw new IllegalStateException(
                    "Job property \"dates\" must contain a start and an end date (dd/MM/yyyy)");
        }

        Date dateStart = formatter.parse(dateTable[0]);
        Date dateEnd = formatter.parse(dateTable[1]);

        // Both bounds are exclusive.
        if (date.after(dateStart) && date.before(dateEnd)) {
            context.write(new Text(client), new IntWritable(somme));
        } else {
            logger.log(Level.INFO, "ELSE ddddddddddddddddddddd");
            logger.log(Level.INFO, "param start " + dateTable[0]);
            logger.log(Level.INFO, "Date start " + dateStart.toString());
            logger.log(Level.INFO, "param fin " + dateTable[1]);
            logger.log(Level.INFO, "Date fin " + dateEnd.toString());
        }

    } catch (ParseException e) {
        logger.log(Level.INFO, "Parse exception", e);
    } catch (NumberFormatException e) {
        // Treat a bad amount like a bad date: log and skip the record.
        logger.log(Level.INFO, "Skipping record with non-numeric amount: " + line, e);
    }

}

From source file:com.ema.hadoop.wordcount.WCMapper.java

/**
 * Emits {@code (token, 1)} for every token of the input line.
 *
 * <p>The line is tokenized with a {@link StringTokenizer} using its default delimiters.
 *
 * @param key     input key (unused)
 * @param value   one input line
 * @param context context the pairs are written to
 */
@Override
public void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException {
    for (StringTokenizer words = new StringTokenizer(value.toString()); words.hasMoreTokens();) {
        Text word = new Text(words.nextToken());
        context.write(word, new IntWritable(1));
    }
}

From source file:com.endgame.binarypig.loaders.av.ClamScanDaemonLoader.java

License:Apache License

/**
 * Sends an {@code nSCAN} request for {@code binaryFile} over a fresh socket to
 * {@code clamdEndoint} and turns the first reply line into a tuple.
 *
 * <p>The resulting tuple holds, in order: the key as a string, whether the socket timed
 * out, and the cleaned-up reply (empty when nothing was read or the socket timed out).
 *
 * @param key        identifies the file; stored as the first tuple field
 * @param value      unused here
 * @param binaryFile file whose absolute path is sent in the request
 * @return tuple of (key, timedOut, result)
 * @throws IOException on socket errors other than a read timeout
 */
@Override
public Tuple processFile(Text key, BytesWritable value, File binaryFile) throws IOException {
    String result = "";
    boolean timedOut = false;
    Socket connection = null;
    OutputStream request = null;
    BufferedReader response = null;
    try {
        connection = new Socket();
        // Only apply a read timeout that fits into an int.
        if (getTimeoutMS() < (long) Integer.MAX_VALUE) {
            connection.setSoTimeout((int) getTimeoutMS());
        }

        connection.connect(clamdEndoint);

        request = connection.getOutputStream();
        response = new BufferedReader(new InputStreamReader(connection.getInputStream()));

        String command = "nSCAN " + binaryFile.getAbsolutePath() + "\n";
        request.write(command.getBytes());

        String reply = response.readLine();
        if (reply != null) {
            // Drop everything up to and including the first ':' (the "/path/to/file:" prefix).
            String afterPath = reply.substring(reply.indexOf(':') + 1);
            // The " FOUND" marker carries no extra information.
            String withoutMarker = afterPath.replace(" FOUND", "");
            // Some clamscan versions append a parenthesised hex/colon suffix; strip it.
            result = withoutMarker.replaceAll("\\([a-f0-9:]+\\)$", "").trim();
        }
    } catch (SocketTimeoutException e) {
        result = "";
        timedOut = true;
    } finally {
        IOUtils.closeSocket(connection);
        IOUtils.closeStream(response);
        IOUtils.closeStream(request);
    }

    getProtoTuple().clear();
    getProtoTuple().add(key.toString());
    getProtoTuple().add(timedOut);
    getProtoTuple().add(result);
    return getTupleFactory().newTuple(getProtoTuple());
}

From source file:com.endgame.binarypig.loaders.ExecutingJsonLoader.java

License:Apache License

/**
 * Builds a tuple of (key, timedOut, parsed JSON output).
 *
 * <p>If parsing or wrapping {@code output} throws, an empty {@code NonSpillableDataBag}
 * is stored in place of the parsed value.
 *
 * @param key      identifies the processed file; stored as a string
 * @param value    unused here
 * @param output   text to be parsed as JSON
 * @param timedOut whether the producing process timed out
 * @return a new tuple created from the shared {@code protoTuple} list
 */
public Tuple outputToTuple(Text key, BytesWritable value, String output, boolean timedOut) {
    protoTuple.clear();

    final String fileKey = key.toString();
    protoTuple.add(fileKey);
    protoTuple.add(timedOut);

    // Generic JSON loading happens here; no explicit Writable-to-Pig casts are applied.
    try {
        final Object parsed = jsonUtil.wrap(jsonParser.parse(output));
        protoTuple.add(parsed);
    } catch (Exception e) {
        // Unparseable output is represented by an empty bag.
        protoTuple.add(new NonSpillableDataBag());
    }

    return tupleFactory.newTuple(protoTuple);
}

From source file:com.endgame.binarypig.loaders.ExecutingTextLoader.java

License:Apache License

/**
 * Builds a tuple of (key, timedOut, raw output text).
 *
 * <p>If storing {@code output} throws, an empty string is stored instead.
 *
 * @param key      identifies the processed file; stored as a string
 * @param value    unused here
 * @param output   text stored verbatim as the third field
 * @param timedOut whether the producing process timed out
 * @return a new tuple created from the shared {@code protoTuple} list
 */
public Tuple outputToTuple(Text key, BytesWritable value, String output, boolean timedOut) {
    protoTuple.clear();

    final String fileKey = key.toString();
    protoTuple.add(fileKey);
    protoTuple.add(timedOut);

    try {
        protoTuple.add(output);
    } catch (Exception e) {
        // Fall back to an empty text field.
        protoTuple.add("");
    }

    return tupleFactory.newTuple(protoTuple);
}

From source file:com.endgame.binarypig.loaders.pehash.HashingLoader.java

License:Apache License

@Override
public Tuple outputToTuple(Text key, BytesWritable value, String output, boolean timedOut) {
    protoTuple.clear();/*from w w w  .  j  ava2 s. co  m*/
    protoTuple.add(key.toString());
    protoTuple.add(timedOut);

    // when adding, might want to consider doing explicit casts from Writables to Pig datatypes - does not appear to be needed at this time
    //This is the spot to do the generic JSON loading. some override function for data formatting would be here
    try {
        Map<String, String> events = gson.fromJson(output, new TypeToken<Map<String, String>>() {
        }.getType());
        protoTuple.add(events.get("md5"));
        protoTuple.add(events.get("sha1"));
        protoTuple.add(events.get("sha256"));
        protoTuple.add(events.get("sha512"));
        protoTuple.add(events.get("pe_hash"));
    } catch (Exception e) {

        protoTuple.add("");
        protoTuple.add("");
        protoTuple.add("");
        protoTuple.add("");
        protoTuple.add("");
    }
    return tupleFactory.newTuple(protoTuple);
}

From source file:com.endgame.binarypig.loaders.TextDaemonLoader.java

License:Apache License

@Override
public Tuple processFile(Text key, BytesWritable value, File binaryFile) throws IOException {
    boolean timedOut = false;
    String result = "";
    try {//from w w  w  .  j  a v  a 2s .  co m
        out.write((binaryFile.getAbsolutePath() + "\n").getBytes());
        String data = in.readLine();
        if (data != null) {
            result = data;
        }
    } catch (SocketTimeoutException e) {
        result = "";
        timedOut = true;
    } catch (SocketException e) {
        System.err.println("WARN: Exception occurred, attempting to re-connect...");
        e.printStackTrace();
        close();
        init();
        out.write((binaryFile.getAbsolutePath() + "\n").getBytes());
        String data = in.readLine();
        if (data != null) {
            result = data;
        }
    }

    getProtoTuple().clear();
    getProtoTuple().add(key.toString());
    getProtoTuple().add(timedOut);
    getProtoTuple().add(result);
    return getTupleFactory().newTuple(getProtoTuple());
}

From source file:com.example.wordcount.operator.WordCountOperator.java

License:Apache License

/**
 * Emits one {@code WordCount} with count 1 for every piece obtained by splitting each
 * line's query on {@code SPACE}.
 *
 * <p>Lines whose query is empty produce no output.
 *
 * @param lines  log lines in the group
 * @param result sink receiving the per-word counts
 */
@CoGroup
public void split(@Key(group = {}) List<LogLine> lines, Result<WordCount> result) {
    for (LogLine line : lines) {
        String queryText = line.getQuery().toString();
        if (!queryText.isEmpty()) {
            for (String word : queryText.split(SPACE)) {
                WordCount single = new WordCount();
                single.setWordAsString(word);
                single.setCount(1);
                result.add(single);
            }
        }
    }
}

From source file:com.examples.ch03.CLFMapper_Ex_1.java

/**
 * Parses one log entry with the pattern {@code p} and emits the first capture group as
 * the key and a tab-separated record as the value.
 *
 * <p>The record holds, in order: the parsed date of group 4 as epoch milliseconds, then
 * groups 6, 8, 9 and 12. Entries that do not match the pattern, or whose date cannot be
 * parsed, are dropped. Every entry is echoed to standard output first.
 *
 * @param key     input key (unused)
 * @param value   one log entry
 * @param context context the key/value pair is written to
 */
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    final String logLine = value.toString();
    System.out.println(logLine);

    final Matcher fields = p.matcher(logLine);
    if (!fields.matches()) {
        return;
    }

    final Date requestTime;
    try {
        requestTime = dateFormatter.parse(fields.group(4));
    } catch (ParseException ex) {
        return;
    }

    outputKey.set(fields.group(1)); // ip

    String record = new StringBuilder()
            .append(requestTime.getTime()).append('\t') // timestamp
            .append(fields.group(6)).append('\t') // page
            .append(fields.group(8)).append('\t') // http status
            .append(fields.group(9)).append('\t') // bytes
            .append(fields.group(12)) // useragent
            .toString();
    outputValue.set(record);
    context.write(outputKey, outputValue);
}