com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java Source code

Introduction

Here is the source code for com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer.java, a GraphTokenizer implementation that turns a single Wikipedia page dump into a document-word graph: one vertex for the page, one vertex per distinct word, and one edge per (page, word) pair carrying the word count.

Source

/* Copyright (C) 2012 Intel Corporation.
 *     All rights reserved.
 *           
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 *
 * For more about this software visit:
 *      http://www.01.org/GraphBuilder 
 */
package com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Scanner;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.collections.iterators.EmptyIterator;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.intel.hadoop.graphbuilder.graph.Edge;
import com.intel.hadoop.graphbuilder.graph.Vertex;
import com.intel.hadoop.graphbuilder.preprocess.inputformat.GraphTokenizer;
import com.intel.hadoop.graphbuilder.types.StringType;

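/**
 * Tokenizes one Wikipedia page dump into a document-word graph fragment:
 * a vertex for the page, a vertex per distinct word, and an edge per
 * (page, word) pair whose edge data is the word count. An optional
 * dictionary (job property "Dictionary") restricts which words are kept.
 */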
public class WordCountGraphTokenizer implements GraphTokenizer<StringType, StringType, StringType> {

    private static final Logger LOG = Logger.getLogger(WordCountGraphTokenizer.class);

    private String title;
    private String id;
    private HashMap<String, Integer> counts;
    private FileSystem fs;
    private HashSet<String> dictionary;

    @Override
    public void configure(JobConf job) {
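        // Open the job's FileSystem up front; the optional dictionary is read from it.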
        try {
            fs = FileSystem.get(job);
        } catch (IOException e) {
            LOG.error("Could not open the FileSystem from the job configuration.", e);
        }

        String path = job.get("Dictionary");
        if (path != null) {
            try {
                loadDictionary(path);
            } catch (Exception e) {
                LOG.error("Could not load dictionary from " + path, e);
            }
        }
    }
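
    // The graph uses StringType for vertex ids, vertex data and edge data alike.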

    @Override
    public Class vidClass() {
        return StringType.class;
    }

    @Override
    public Class vdataClass() {
        return StringType.class;
    }

    @Override
    public Class edataClass() {
        return StringType.class;
    }

    public void parse(String s) {
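        // Pull the page title, id and revision text out of the XML with XPath,
        // then tokenize the text with Lucene's StandardAnalyzer and count terms.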
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder;
        counts = new HashMap<String, Integer>();
        try {
            builder = factory.newDocumentBuilder();
            Document doc = builder.parse(new InputSource(new StringReader(s)));
            XPathFactory xfactory = XPathFactory.newInstance();
            XPath xpath = xfactory.newXPath();
            title = xpath.evaluate("//page/title/text()", doc);
            title = title.replaceAll("\\s", "_");
            // title = title.replaceAll("^[^a-zA-Z0-9]", "#");
            // title = title.replaceAll("[^a-zA-Z0-9.]", "_");
            id = xpath.evaluate("//page/id/text()", doc);
            String text = xpath.evaluate("//page/revision/text/text()", doc);

            if (!text.isEmpty()) {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
                TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
                while (stream.incrementToken()) {
                    String token = stream.getAttribute(TermAttribute.class).term();

                    if (dictionary != null && !dictionary.contains(token))
                        continue;

                    if (counts.containsKey(token))
                        counts.put(token, counts.get(token) + 1);
                    else
                        counts.put(token, 1);
                }
                stream.close();
            }
        } catch (ParserConfigurationException e) {
            LOG.error("Could not construct an XML parser.", e);
        } catch (SAXException e) {
            LOG.error("Malformed page XML.", e);
        } catch (IOException e) {
            LOG.error("I/O error while reading page text.", e);
        } catch (XPathExpressionException e) {
            LOG.error("Bad XPath expression.", e);
        }
    }

    @Override
    public Iterator<Vertex<StringType, StringType>> getVertices() {
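        // First the page vertex (id -> title), then one empty-data vertex per distinct word.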
        ArrayList<Vertex<StringType, StringType>> vlist = new ArrayList<Vertex<StringType, StringType>>(
                counts.size() + 1);
        vlist.add(new Vertex<StringType, StringType>(new StringType(id), new StringType(title)));
        Iterator<String> iter = counts.keySet().iterator();
        while (iter.hasNext()) {
            vlist.add(new Vertex<StringType, StringType>(new StringType(iter.next()), new StringType()));
        }
        return vlist.iterator();
    }

    @Override
    public Iterator<Edge<StringType, StringType>> getEdges() {
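        // One page -> word edge per distinct word; the edge data carries the count.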
        if (counts.isEmpty())
            return EmptyIterator.INSTANCE;

        ArrayList<Edge<StringType, StringType>> elist = new ArrayList<Edge<StringType, StringType>>(counts.size());
        Iterator<Entry<String, Integer>> iter = counts.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, Integer> e = iter.next();
            elist.add(new Edge<StringType, StringType>(new StringType(id), new StringType(e.getKey()),
                    new StringType(e.getValue().toString())));
        }
        return elist.iterator();
    }

    private void loadDictionary(String path) throws IOException {
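        // Read every file under the given path; each line contributes one dictionary word.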
        FileStatus[] stats = fs.listStatus(new Path(path));
        dictionary = new HashSet<String>();
        for (FileStatus stat : stats) {
            LOG.debug("Load dictionary: " + stat.getPath().getName());
            Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(fs.open(stat.getPath()))));
            while (sc.hasNextLine()) {
                dictionary.add(sc.nextLine());
            }
            sc.close();
        }
    }

}
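
Example usage

Below is a minimal sketch of driving the tokenizer outside a Hadoop job. The inline <page> XML, the demo class name, and the reliance on the Vertex/Edge toString() output are assumptions for illustration; configure() is skipped, so no dictionary filtering is applied.

package com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph;

import java.util.Iterator;

public class WordCountGraphTokenizerDemo {
    public static void main(String[] args) {
        WordCountGraphTokenizer tokenizer = new WordCountGraphTokenizer();

        // Hypothetical single-page dump; parse() expects //page/title,
        // //page/id and //page/revision/text to resolve.
        String page = "<page>"
                + "<title>Graph theory</title>"
                + "<id>42</id>"
                + "<revision><text>graph builder builds a graph</text></revision>"
                + "</page>";
        tokenizer.parse(page);

        // One vertex for the page itself, then one per distinct word.
        Iterator<?> vertices = tokenizer.getVertices();
        while (vertices.hasNext())
            System.out.println("vertex: " + vertices.next());

        // One edge per (page, word) pair; edge data is the word count.
        Iterator<?> edges = tokenizer.getEdges();
        while (edges.hasNext())
            System.out.println("edge: " + edges.next());
    }
}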