// Java tutorial
/**
 * Copyright 2011 Pablo Mendes, Max Jakob
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.dbpedia.spotlight.io;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.store.FSDirectory;
import org.dbpedia.spotlight.exceptions.SearchException;
import org.dbpedia.spotlight.lucene.LuceneManager;
import org.dbpedia.spotlight.lucene.search.LuceneCandidateSearcher;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.SpotlightConfiguration;
import org.dbpedia.spotlight.model.SurfaceForm;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;
import org.semanticweb.yars.nx.parser.ParseException;

import java.io.*;
import java.util.*;

/**
 * Splits an occurrence dataset (tab-separated lines, assumed sorted by URI so that all
 * occurrences of one URI are contiguous) into a training set and a test set.
 *
 * Subclasses decide which groups of occurrences to keep ({@link #shouldKeepTheseOccurrences})
 * and how to divide a kept group between the two output files ({@link #split}).
 *
 * Created by IntelliJ IDEA.
 * User: PabloMendes
 * Date: Jul 23, 2010
 * Time: 3:53:58 PM
 */
public abstract class DatasetSplitter implements Closeable {

    Log LOG = LogFactory.getLog(this.getClass());

    // Running count of input lines seen; also used to derive per-occurrence ids in split().
    int incrementalId = 0;

    Writer mTrainingSetWriter;
    Writer mTestSetWriter;

    /**
     * Abstract constructor. Please see {@link BySize} and {@link BySurfaceForm}.
     *
     * @param trainingSetFile destination for training occurrences
     * @param testSetFile     destination for test occurrences
     * @throws IOException if either file cannot be opened for writing
     */
    public DatasetSplitter(File trainingSetFile, File testSetFile) throws IOException {
        // Write UTF-8 explicitly so the output charset matches the UTF-8 input read in run().
        // (FileWriter would use the platform default charset.)
        this.mTrainingSetWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(trainingSetFile), "UTF-8"));
        this.mTestSetWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(testSetFile), "UTF-8"));
    }

    /** Decides whether a group of occurrences (all for the same URI) is kept at all. */
    public abstract boolean shouldKeepTheseOccurrences(List<String> items);

    /** Distributes a kept group of occurrences between training and test writers. */
    public abstract void split(List<String> items) throws IOException;

    /**
     * Flushes and closes both output writers. Must be called when splitting is done,
     * otherwise buffered output is lost.
     */
    public void close() throws IOException {
        mTrainingSetWriter.close();
        mTestSetWriter.close();
    }

    //TODO Max: question: does this assume sorting by URI? -- yes, occurrences of one URI must be contiguous.
    /**
     * Reads UTF-8 TSV occurrence lines from {@code stream}, groups consecutive lines by URI,
     * and hands each kept group to {@link #split}.
     *
     * @param stream occurrence data, one occurrence per line, sorted by URI
     * @throws IOException if writing to either output file fails
     */
    public void run(InputStream stream) throws IOException {
        String currentItem = "";
        List<String> items = new ArrayList<String>();
        Scanner scanner = new Scanner(new InputStreamReader(stream, "UTF-8"));
        int nItemsKept = 0;
        // hasNextLine() (not hasNext()) because we consume the input line-by-line.
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            incrementalId++;

            if (line == null || line.trim().equals(""))
                continue;

            String[] fields = line.split("\t");
            // Two known layouts: >=5 fields put the URI first, shorter ones put it second.
            String uri;
            if (fields.length >= 5) {
                uri = fields[0];
            } else {
                uri = fields[1];
            }

            if (!uri.equals(currentItem)) {
                // URI changed: the previous group is complete; decide whether to keep it.
                if (shouldKeepTheseOccurrences(items)) {
                    nItemsKept++;
                    LOG.trace("End of current item: " + currentItem + " / size: " + items.size() + " - saving!");
                    split(items);
                } // else ignore
                // reset current item
                currentItem = uri;
                items = new ArrayList<String>();
            }
            items.add(line);

            if (incrementalId % 50000 == 0)
                LOG.info("Processed " + incrementalId + " occurrences. Kept occurrences for " + nItemsKept + " URIs.");
        }
        // FIX: flush the final group. Previously the occurrences of the last URI in the
        // stream were silently dropped because split() only fired when a new URI appeared.
        if (!items.isEmpty() && shouldKeepTheseOccurrences(items)) {
            nItemsKept++;
            LOG.trace("End of current item: " + currentItem + " / size: " + items.size() + " - saving!");
            split(items);
        }
        scanner.close();
        // Push buffered output to disk; close() remains the caller's responsibility.
        mTrainingSetWriter.flush();
        mTestSetWriter.flush();
        LOG.info("Processed " + incrementalId + " occurrences. Kept occurrences for " + nItemsKept + " URIs");
    }

    /**
     * Writes one occurrence as "id&lt;TAB&gt;item&lt;NEWLINE&gt;" to the given writer.
     */
    public void write(int id, String item, Writer writer) throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append(id);
        sb.append("\t");
        sb.append(item);
        sb.append("\n");
        writer.write(sb.toString());
    }

    /**
     * Keeps every URI with at least {@code minNumberOfExamples} occurrences and sends
     * roughly {@code percentSplit} of each kept group to the test set, the rest to training.
     */
    public static class BySize extends DatasetSplitter {

        int mMinNumberOfExamples = 1;
        double mPercentSplit = 0.5;

        public BySize(File trainingSetFile, File testSetFile, int minNumberOfExamples, double percentSplit)
                throws IOException {
            super(trainingSetFile, testSetFile);
            this.mMinNumberOfExamples = minNumberOfExamples;
            this.mPercentSplit = percentSplit;
        }

        @Override
        public boolean shouldKeepTheseOccurrences(List<String> items) {
            return items.size() >= mMinNumberOfExamples;
        }

        @Override
        public void split(List<String> items) throws IOException {
            int i = incrementalId - items.size(); // set starting id for this group
            // Number of items destined for the test set.
            int n = (int) (items.size() * mPercentSplit);
            for (String item : items) {
                if ((n > 0) && // When there are enough items for dividing in training and testing
                        (i % (items.size() / n) == 0)) { // For a 10% split, sample every 10th entry
                    LOG.trace("Writing to test: " + i + " " + items.size() + "/" + n);
                    write(i, item, mTestSetWriter);
                } else {
                    // For a 10% split, it will write to training 90% of the times, plus
                    // when there are not enough examples to split between training and testing.
                    // That should assure that all senses are in training to be picked.
                    LOG.trace("Writing to training: " + i);
                    write(i, item, mTrainingSetWriter);
                }
                i++;
            }
        }
    }

    /**
     * Like {@link BySize}, but additionally requires that at least one occurrence in the
     * group carries one of the given valid surface forms (case-insensitive exact match).
     */
    public static class BySurfaceForm extends BySize {

        Set<String> mValidSurfaceForms = new HashSet<String>();

        public BySurfaceForm(File trainingSetFile, File testSetFile, int minNumberOfExamples, double percentSplit,
                             Set<String> validSurfaceForms) throws IOException {
            super(trainingSetFile, testSetFile, minNumberOfExamples, percentSplit);
            mValidSurfaceForms = validSurfaceForms;
            LOG.info("Assuming " + validSurfaceForms.size() + " valid surface forms to acquire occurrence samples.");
        }

        @Override
        public boolean shouldKeepTheseOccurrences(List<String> items) {
            for (String item : items) {
                String sf = "";
                try {
                    String[] fields = item.split("\t");
                    // Surface form position depends on the line layout (see run()).
                    if (fields.length >= 5) {
                        sf = fields[2];
                    } else {
                        sf = fields[1];
                    }
                } catch (ArrayIndexOutOfBoundsException e) {
                    LOG.debug("Error parsing line: " + item);
                }
                for (String validSf : mValidSurfaceForms) {
                    //if (sf.toLowerCase().contains(validSf.toLowerCase())) { // relaxed
                    if (sf.equalsIgnoreCase(validSf)) { // strict
                        LOG.trace("Kept:" + sf + " because it matches " + validSf);
                        return true; // one match is enough to keep the whole group
                    }
                }
            }
            return false;
        }
    }

    /**
     * TODO created by Max: this functions allows for one call to create "confusable-with" sets
     * For a given type, goes through the data set that keeps the types for each resource.
     * If the type matches, look in the surrogate index for this URI (opposite direction as usually)
     * for all surface forms that can relate to this URI.
     * Return all surface forms found this way.
     */
    public static Set<String> getConfusableSurfaceForms(String targetType, File instancesFile,
                                                        LuceneCandidateSearcher surrogateSearcher)
            throws IOException, ParseException {
        System.err.println("Getting all surface forms for " + targetType + "s...");
        Set<String> surfaceForms = new HashSet<String>();
        if (!targetType.startsWith(SpotlightConfiguration.DEFAULT_ONTOLOGY_PREFIX))
            targetType = SpotlightConfiguration.DEFAULT_ONTOLOGY_PREFIX + targetType;
        NxParser parser = new NxParser(new FileInputStream(instancesFile));
        while (parser.hasNext()) {
            Node[] triple = parser.next();
            if (triple[2].toString().equals(targetType)) {
                String targetUri = triple[0].toString().replace(SpotlightConfiguration.DEFAULT_NAMESPACE, "");
                try {
                    Set<SurfaceForm> surfaceFormsForURI =
                            surrogateSearcher.getSurfaceForms(new DBpediaResource(targetUri));
                    for (SurfaceForm sf : surfaceFormsForURI) {
                        surfaceForms.add(sf.name());
                    }
                } catch (SearchException e) {
                    System.err.println("URI " + targetUri + " not found in surrogate index. Skipping.");
                }
            }
        }
        return surfaceForms;
    }

    //TODO Make this guy parameterizable from command line.
    public static void main(String[] args) throws IOException, ParseException {
        /**
         * Split dataset in training and test.
         * percentageSplit indicates how much to save for testing
         * minSize indicates the minimum number of occurrences a URI has to have for it to make it to training/testing
         */
        int minSize = 2;
        double percentageSplit = 0.5;
        String targetType = "Actor"; //"Person"; //Place //Organisation

        /* Here I'm using wikipediaOccurrences.ambiguous.tsv.gz
           Be careful here. Do not use withtype because it is a join with the types,
           so for URIs that have multiple types the same entry is repeated multiple times. */
        System.err.println("Making confusable with " + targetType + " data sets.");
        File inputFile = new File("data/WikipediaOccurrences-IDs-clean_enwiki-20100312.uriSorted.tsv");
        File trainingFile = new File("E:/dbpa/data/Person_newSurrogates/wikipediaTraining."
                + (int) ((1 - percentageSplit) * 100) + "." + targetType + ".amb.tsv");
        File testFile = new File("E:/dbpa/data/Person_newSurrogates/wikipediaTest."
                + (int) (percentageSplit * 100) + "." + targetType + ".amb.tsv");

        // using the next few lines, to create "confusable-with", split in training and testing
        File instancesFile = new File("data/dbpedia/instance_types_en.nt");
        File surrogateIndexDir = new File("data/SurrogateIndex.TitRedDisOcc.lowerCase");
        LuceneManager manager = new LuceneManager.CaseInsensitiveSurfaceForms(FSDirectory.open(surrogateIndexDir));
        LuceneCandidateSearcher surrogateSearcher = new LuceneCandidateSearcher(manager, false);
        Set<String> surfaceForms = getConfusableSurfaceForms(targetType, instancesFile, surrogateSearcher);

        DatasetSplitter splitter = new BySurfaceForm(trainingFile, testFile, minSize, percentageSplit, surfaceForms);
        //DatasetSplitter splitter = new BySize(trainingFile, testFile, minSize, percentageSplit);
        try {
            splitter.run(new FileInputStream(inputFile));
            //splitter.run(new GZIPInputStream(new FileInputStream(inputFile)));
        } finally {
            // Flush and close output writers so buffered results are not lost.
            splitter.close();
        }
    }
}