// Java tutorial
/* Copyright (C) 2012 Intel Corporation. * All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * For more about this software visit: * http://www.01.org/GraphBuilder */ package com.intel.hadoop.graphbuilder.demoapps.wikipedia.linkgraph; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.collections.iterators.EmptyIterator; import org.apache.hadoop.mapred.JobConf; import org.apache.log4j.Logger; import org.w3c.dom.Document; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import com.intel.hadoop.graphbuilder.graph.Edge; import com.intel.hadoop.graphbuilder.graph.Vertex; import com.intel.hadoop.graphbuilder.preprocess.inputformat.GraphTokenizer; import com.intel.hadoop.graphbuilder.types.EmptyType; import com.intel.hadoop.graphbuilder.types.StringType; public class LinkGraphTokenizer implements GraphTokenizer<StringType, EmptyType, EmptyType> { private static final Logger LOG = Logger.getLogger(LinkGraphTokenizer.class); public LinkGraphTokenizer() throws ParserConfigurationException { factory = 
DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); builder = factory.newDocumentBuilder(); XPathFactory xfactory = XPathFactory.newInstance(); xpath = xfactory.newXPath(); vlist = new ArrayList<Vertex<StringType, EmptyType>>(); elist = new ArrayList<Edge<StringType, EmptyType>>(); links = new ArrayList<String>(); } @Override public void configure(JobConf job) { } @Override public Class vidClass() { return StringType.class; } @Override public Class vdataClass() { return EmptyType.class; } @Override public Class edataClass() { return EmptyType.class; } public void parse(String s) { try { Document doc = builder.parse(new InputSource(new StringReader(s))); title = xpath.evaluate("//page/title/text()", doc); title = title.replaceAll("\\s", "_"); id = xpath.evaluate("//page/id/text()", doc); String text = xpath.evaluate("//page/revision/text/text()", doc); parseLinks(text); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (XPathExpressionException e) { e.printStackTrace(); } } public Iterator<Vertex<StringType, EmptyType>> getVertices() { vlist.clear(); vlist.add(new Vertex<StringType, EmptyType>(new StringType(title), EmptyType.INSTANCE)); for (String link : links) vlist.add(new Vertex<StringType, EmptyType>(new StringType(link), EmptyType.INSTANCE)); return vlist.iterator(); } @Override public Iterator<Edge<StringType, EmptyType>> getEdges() { if (links.isEmpty()) return EmptyIterator.INSTANCE; elist.clear(); Iterator<String> iter = links.iterator(); while (iter.hasNext()) { elist.add(new Edge<StringType, EmptyType>(new StringType(title), new StringType(iter.next()), EmptyType.INSTANCE)); } return elist.iterator(); } /** This function is taken and modified from wikixmlj WikiTextParser */ private void parseLinks(String text) { links.clear(); Pattern catPattern = Pattern.compile("\\[\\[(.*?)\\]\\]", Pattern.MULTILINE); Matcher matcher = catPattern.matcher(text); while (matcher.find()) { String[] 
temp = matcher.group(1).split("\\|"); if (temp == null || temp.length == 0) continue; String link = temp[0]; if (!link.replaceAll("\\s", "").isEmpty() && !link.contains(":")) { links.add(link.replaceAll("\\s", "_")); } } } private String id; private String title; private List<String> links; private ArrayList<Vertex<StringType, EmptyType>> vlist; private ArrayList<Edge<StringType, EmptyType>> elist; private DocumentBuilderFactory factory; private DocumentBuilder builder; private XPath xpath; }