Java tutorial
/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.tagrecommender;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.XMLEvent;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that copies question rows out of an XML input file into
 * two separate XML files: a training set and a test set.
 *
 * <p>A row is treated as a question when it is a <code>row</code> element
 * carrying the attribute <code>PostTypeId="1"</code>. Both output files are
 * filled from a single forward pass over the input, so the test set consists
 * of the questions that follow the ones written to the training set.
 */
public class ExtractStackOverflowData {

  private static final Logger log = LoggerFactory.getLogger(ExtractStackOverflowData.class);

  /** XML file to read rows from. */
  File inputFile;
  /** Destination of the first <code>trainingDataSize</code> questions. */
  File trainingOutputFile;
  /** Destination of the following <code>testDataSize</code> questions. */
  File testOutputFile;
  /** Maximum number of questions written to the training file. */
  int trainingDataSize = 100000;
  /** Maximum number of questions written to the test file. */
  int testDataSize = 10000;

  /**
   * Parses the command line and, when it is valid, runs the extraction.
   *
   * @param args command-line arguments, see {@link #parseArgs(String[])}
   */
  public static void main(String[] args) {
    ExtractStackOverflowData si = new ExtractStackOverflowData();
    if (si.parseArgs(args)) {
      si.extract();
    }
  }

  /**
   * Populates this instance from command-line arguments and validates the
   * result.
   *
   * @param args the raw command-line arguments
   * @return <code>true</code> if the arguments were parsed and extraction
   *         should proceed; <code>false</code> if help was requested or the
   *         arguments could not be parsed (usage is printed in both cases)
   * @throws IllegalArgumentException if a parsed value fails
   *         {@link #validate()}, or a size option is not a valid integer
   *         (as a {@link NumberFormatException})
   */
  public boolean parseArgs(String[] args) {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputFileOpt = singleValueOption(obuilder, abuilder,
        "inputFile", "i", "The input file", true);

    Option trainingOutputOpt = singleValueOption(obuilder, abuilder,
        "trainingOutputFile", "tr", "The training data output file", true);

    Option testOutputOpt = singleValueOption(obuilder, abuilder,
        "testOutputFile", "te", "The test data output file", true);

    Option trainingDataSizeOpt = singleValueOption(obuilder, abuilder,
        "trainingDataSize", "trs", "The number of questions to extract for training data", false);

    Option testDataSizeOpt = singleValueOption(obuilder, abuilder,
        "testDataSize", "tes", "The number of questions to extract for test data", false);

    // The help option has to be part of the group, otherwise the parser
    // rejects it as an unexpected token and hasOption(helpOpt) is never true.
    Group group = gbuilder.withName("Options")
        .withOption(inputFileOpt)
        .withOption(trainingOutputOpt)
        .withOption(testOutputOpt)
        .withOption(trainingDataSizeOpt)
        .withOption(testDataSizeOpt)
        .withOption(helpOpt)
        .create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      // Lets the help flag be used on its own, without the required options.
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return false;
      }

      inputFile = new File((String) cmdLine.getValue(inputFileOpt));
      trainingOutputFile = new File((String) cmdLine.getValue(trainingOutputOpt));
      testOutputFile = new File((String) cmdLine.getValue(testOutputOpt));

      if (cmdLine.hasOption(trainingDataSizeOpt)) {
        trainingDataSize = Integer.parseInt((String) cmdLine.getValue(trainingDataSizeOpt));
      }
      if (cmdLine.hasOption(testDataSizeOpt)) {
        testDataSize = Integer.parseInt((String) cmdLine.getValue(testDataSizeOpt));
      }
    } catch (OptionException e) {
      log.error("Command-line option Exception", e);
      CommandLineUtil.printHelp(group);
      return false;
    }

    validate();
    return true;
  }

  /**
   * Builds an option that takes exactly one argument value. The argument is
   * given the same name as the option's long name.
   */
  private static Option singleValueOption(DefaultOptionBuilder obuilder, ArgumentBuilder abuilder,
      String longName, String shortName, String description, boolean required) {
    return obuilder.withLongName(longName).withRequired(required)
        .withArgument(abuilder.withName(longName).withMinimum(1).withMaximum(1).create())
        .withDescription(description).withShortName(shortName).create();
  }

  /**
   * Checks the configured files and sizes before extraction starts.
   *
   * @throws IllegalArgumentException if the input file does not exist or
   *         either size is less than 1; output-file checks are delegated to
   *         <code>Util.validateFileWritable</code>
   */
  protected void validate() {
    if (!inputFile.exists()) {
      throw new IllegalArgumentException("inputFile " + inputFile.getAbsolutePath() + " does not exist");
    }
    if (trainingDataSize < 1) {
      throw new IllegalArgumentException("trainingDataSize must be 1 or more");
    }
    if (testDataSize < 1) {
      throw new IllegalArgumentException("testDataSize must be 1 or more");
    }
    Util.validateFileWritable(trainingOutputFile);
    Util.validateFileWritable(testOutputFile);
  }

  /**
   * Reads the input file once, writing up to <code>trainingDataSize</code>
   * questions to the training file and then up to <code>testDataSize</code>
   * further questions to the test file.
   *
   * <p>I/O and XML errors are logged and not rethrown. All streams opened
   * here are closed whether or not extraction succeeds.
   */
  public void extract() {
    XMLInputFactory xif = XMLInputFactory.newInstance();
    XMLOutputFactory xof = XMLOutputFactory.newInstance();

    InputStream is = null;
    XMLStreamReader reader = null;

    try {
      log.info("Reading data from " + inputFile);

      is = new FileInputStream(inputFile);
      reader = xif.createXMLStreamReader(is);

      // The same reader is handed to both calls, so the second one resumes
      // where the first one stopped.
      int trainingDataCount = extractToFile(xof, reader, trainingOutputFile, trainingDataSize);
      int testDataCount = extractToFile(xof, reader, testOutputFile, testDataSize);

      log.info("Extracted " + trainingDataCount + " rows of training data");
      log.info("Extracted " + testDataCount + " rows of test data");
    } catch (XMLStreamException e) {
      log.error("XML error while extracting data from " + inputFile, e);
    } catch (IOException e) {
      log.error("I/O error while extracting data from " + inputFile, e);
    } finally {
      // XMLStreamReader.close() does not close the underlying stream, so
      // both have to be closed explicitly.
      if (reader != null) {
        try {
          reader.close();
        } catch (XMLStreamException e) {
          log.warn("Unable to close XML reader for " + inputFile, e);
        }
      }
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.warn("Unable to close " + inputFile, e);
        }
      }
    }
  }

  /**
   * Opens <code>outputFile</code>, copies up to <code>limit</code> questions
   * from <code>reader</code> into it and always closes the file afterwards.
   */
  private int extractToFile(XMLOutputFactory xof, XMLStreamReader reader, File outputFile, int limit)
      throws IOException, XMLStreamException {
    OutputStream os = new FileOutputStream(outputFile);
    try {
      XMLStreamWriter writer = xof.createXMLStreamWriter(os);
      return extractXMLData(reader, writer, limit);
    } finally {
      os.close();
    }
  }

  /**
   * Extracts as many as <code>limit</code> questions from the
   * <code>reader</code> provided, writing them to <code>writer</code> as
   * <code>row</code> elements inside a single <code>posts</code> root element.
   * Every attribute of a matching input row is copied by local name.
   *
   * <p>The writer is flushed and closed before returning; the reader is left
   * open at the position reached so that a later call can continue from it.
   *
   * @param reader source of <code>row</code> elements, read forward from its
   *        current position
   * @param writer destination document; a complete document is written even
   *        when no question is found
   * @param limit maximum number of questions to copy
   * @return the number of questions actually written, at most
   *         <code>limit</code>
   * @throws XMLStreamException if reading or writing XML fails
   */
  protected int extractXMLData(XMLStreamReader reader, XMLStreamWriter writer, int limit)
      throws XMLStreamException {
    int questionCount = 0;

    writer.writeStartDocument();
    writer.writeStartElement("posts");
    writer.writeCharacters("\n");

    while (reader.hasNext() && questionCount < limit) {
      if (reader.next() != XMLEvent.START_ELEMENT) {
        continue;
      }
      if (!reader.getLocalName().equals("row") || !isQuestion(reader)) {
        continue;
      }

      writer.writeCharacters(" ");
      writer.writeStartElement("row");
      int attrCount = reader.getAttributeCount();
      for (int i = 0; i < attrCount; i++) {
        writer.writeAttribute(reader.getAttributeName(i).getLocalPart(),
            reader.getAttributeValue(i));
      }
      writer.writeEndElement();
      writer.writeCharacters("\n");
      questionCount++;
    }

    writer.writeEndElement();
    writer.writeEndDocument();
    writer.flush();
    writer.close();

    return questionCount;
  }

  /**
   * Tells whether the start element the reader is positioned on has the
   * attribute <code>PostTypeId="1"</code>, which marks a question.
   */
  private static boolean isQuestion(XMLStreamReader reader) {
    int attrCount = reader.getAttributeCount();
    for (int i = 0; i < attrCount; i++) {
      if (reader.getAttributeName(i).getLocalPart().equals("PostTypeId")
          && reader.getAttributeValue(i).equals("1")) {
        return true;
      }
    }
    return false;
  }
}