Java tutorial: creating a Lucene ontology index with BuildIndex
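This tutorial walks through BuildIndex.java, the class that turns a set of plain-text ontology dumps into the Lucene index used by the AQUA system. A typical input directory (the path is only illustrative) looks like the sketch below; the exact format of each file is documented in the comment preceding the method that parses it:

    /data/ontology-files/
        triples                      subject, property, value triples
        property_labels              property URI + label, one pair per row
        class_labels                 class URI + label
        class_parents                class URI + parent class URI
        entity_labels                entity URI + label
        entity_classes               entity URI + class URI
        basic_types_literal_types    basic type URI + Double, Date, String, or Boolean
        additional_property_labels   optional extra labels
        additional_class_labels      optional extra labels
        additional_entity_labels     optional extra labels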
package edu.ucla.cs.scai.canali.core.index;

import edu.ucla.cs.scai.canali.core.index.tokens.PropertyToken;
import edu.ucla.cs.scai.canali.core.index.tokens.ClassToken;
import edu.ucla.cs.scai.canali.core.index.tokens.EntityToken;
import edu.ucla.cs.scai.canali.core.index.tokens.IndexedToken;
import static edu.ucla.cs.scai.canali.core.index.tokens.LiteralToken.*;
import edu.ucla.cs.scai.canali.core.index.tokens.OntologyElementToken;
import edu.ucla.cs.scai.canali.core.index.utils.Trie;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.ObjectOutputStream;
import java.io.PrintWriter;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

/**
 * @author Giuseppe M.
 *         Mazzeo <mazzeo@cs.ucla.edu>
 *
 * This class contains the functionality for creating a Lucene directory that can be
 * used by the AQUA system, starting from an ontology stored in the following
 * text files (the format of each file is specified in the method using it)
 *
 * triples: contains the triples subject, property, value
 *
 * property_labels: each row contains a property URI and a label - the same
 * property URI can appear on multiple rows (multiple labels)
 *
 * class_labels: each row contains a class URI and a label - the same
 * class URI can appear on multiple rows (multiple labels)
 *
 * class_parents: each row contains a class URI and the URI of one of its
 * class parents
 *
 * entity_labels: each row contains an entity URI and a label - the same entity
 * URI can appear on multiple rows (multiple labels)
 *
 * entity_classes: each row contains an entity URI and the URI of one of its
 * classes
 *
 * basic_types_literal_types: each row contains a basic type URI and either
 * Double, Date, String, or Boolean
 *
 * additional_property_labels: other property labels
 *
 * additional_class_labels: other class labels
 *
 * additional_entity_labels: other entity labels
 */
public class BuildIndex {

    String basePathInput, basePathOutput;
    int[] entityTriplesSubjects;
    int[] entityTriplesProperties;
    int[] entityTriplesValues;
    //ArrayList<int[]> entityTriples = new ArrayList<>();
    //HashMap<String, ArrayList<int[]>> literalTriples = new HashMap<>();
    HashMap<String, int[]> literalTriplesSubjects = new HashMap<>();
    HashMap<String, int[]> literalTriplesProperties = new HashMap<>();
    HashSet<String> literalTypes = new HashSet<>();
    HashMap<String, Integer> entityIdFromUriWithPrefix = new HashMap<>();
    HashMap<String, Integer> propertyIdFromUri = new HashMap<>();
    HashMap<String, Integer> classIdFromUri = new HashMap<>();
    String[] entityUriWithPrefix;
    String[] propertyUri;
    String[] classUri;
    HashSet<String>[] propertyLabels;
    HashSet<String>[] entityLabels;
    HashSet<String>[] classLabels;
    HashSet<Integer>[] entityClasses;
    HashMap<String, String> basicTypesMapping = new HashMap<>();
    HashSet<Integer>[] classParents;
    HashSet<Integer>[] classAncestors;
    HashSet<Integer>[] classChildren;
    HashSet<Integer>[] classDescendants;
    HashSet<Integer>[] entityOutProperties;
    HashSet<Integer>[] entityInProperties;
    HashSet<Integer>[] classOutProperties;
    HashSet<Integer>[] classInProperties;
    HashSet<Integer>[] propertyInProperties;
    HashSet<Integer>[] propertyOutProperties;
    int[] propertyCount;
    boolean[] propertyHasLiteralRange;
    HashMap<String, HashSet<Integer>> literalTypesInProperties = new HashMap<>();
    int iEntityTriples = 0;
    HashMap<String, Integer> iLiteralTriples = new HashMap<>();
    public final static String THING = "http://www.w3.org/2002/07/owl#Thing";
    int thingId;
    boolean printFiles = false;
    int minPropertyLength = 2;
    static final HashMap<String, String> uriToPrefix = new HashMap<>();
    static final HashMap<String, String> prefixToUri = new HashMap<>();
    static final boolean DROP_UNLABELED_ENTITIES = false;

    static {
        //each URI must appear exactly once: a repeated key would silently overwrite
        //the prefix assigned to it earlier
        uriToPrefix.put("http://musicbrainz.org/area/", "1:");
        uriToPrefix.put("http://musicbrainz.org/artist/", "3:");
        uriToPrefix.put("http://musicbrainz.org/record/", "4:");
        uriToPrefix.put("http://musicbrainz.org/recording/", "6:");
        uriToPrefix.put("http://musicbrainz.org/place/", "8:");
        uriToPrefix.put("http://musicbrainz.org/tag/", "9:");
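        //For example, "http://musicbrainz.org/artist/1234" is stored internally as
        //"3:1234"; getEntityCompleteUri() below restores the full URI when needed.
        //Compressing the common namespaces keeps the keys of the URI-to-id HashMap
        //short, which saves a noticeable amount of heap when millions of entities
        //are loaded.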
uriToPrefix.put("http://musicbrainz.org/track/", "a:"); uriToPrefix.put("http://musicbrainz.org/work/", "b:"); for (Map.Entry<String, String> e : uriToPrefix.entrySet()) { prefixToUri.put(e.getValue(), e.getKey()); } } public BuildIndex(String basePathInput, String basePathOutput) { if (!basePathInput.endsWith(File.separator)) { basePathInput += File.separator; } this.basePathInput = basePathInput; if (!basePathOutput.endsWith(File.separator)) { basePathOutput += File.separator; } this.basePathOutput = basePathOutput; literalTypesInProperties.put(DOUBLE, new HashSet<Integer>()); literalTypesInProperties.put(STRING, new HashSet<Integer>()); literalTypesInProperties.put(DATE, new HashSet<Integer>()); literalTypesInProperties.put(BOOLEAN, new HashSet<Integer>()); //literalTriples.put(STRING, new ArrayList<int[]>()); //literalTriples.put(BOOLEAN, new ArrayList<int[]>()); //literalTriples.put(DOUBLE, new ArrayList<int[]>()); //literalTriples.put(DATE, new ArrayList<int[]>()); literalTypes.add(STRING); literalTypes.add(BOOLEAN); literalTypes.add(DOUBLE); literalTypes.add(DATE); for (String type : literalTypes) { iLiteralTriples.put(type, 0); } } private Integer getEntityIdFromUri(String uri) { for (String p : uriToPrefix.keySet()) { if (uri.startsWith(p)) { uri = uri.replace(p, uriToPrefix.get(p)); break; } } return entityIdFromUriWithPrefix.get(uri); } private String getEntityCompleteUri(String uri) { for (String p : prefixToUri.keySet()) { if (uri.startsWith(p)) { return uri.replace(p, prefixToUri.get(p)); } } return uri; } private void putEntityIdFromUri(String uri, int id) { for (String p : uriToPrefix.keySet()) { if (uri.startsWith(p)) { uri = uri.replace(p, uriToPrefix.get(p)); break; } } entityIdFromUriWithPrefix.put(uri, id); } private void updateTriples(String subj, String attr, String entityVal, String literalType) { Integer idSbj = getEntityIdFromUri(subj);//entityIdFromUri.get(subj); Integer idAttr = propertyIdFromUri.get(attr); if (entityVal != null) { Integer idVal = getEntityIdFromUri(entityVal);//entityIdFromUri.get(entityVal); entityTriplesSubjects[iEntityTriples] = idSbj; entityTriplesProperties[iEntityTriples] = idAttr; entityTriplesValues[iEntityTriples] = idVal; iEntityTriples++; //now, create the inverted triple Integer idInvAttr = propertyIdFromUri.get(attr + "Inv"); entityTriplesSubjects[iEntityTriples] = idVal; entityTriplesProperties[iEntityTriples] = idInvAttr; entityTriplesValues[iEntityTriples] = idSbj; iEntityTriples++; } else { int pos = iLiteralTriples.get(literalType); literalTriplesSubjects.get(literalType)[pos] = idSbj; literalTriplesProperties.get(literalType)[pos] = idAttr; iLiteralTriples.put(literalType, pos + 1); } } /* The file triples must contain one row per triple <subject, property, value>, with the form <uri_subject> <uri_property> <uri_value> or <uri_subject> <uri_property> "value" or <uri_subject> <uri_property> "value"<basic_type> e.g. <http://dbpedia.org/resource/01_Communique> <http://dbpedia.org/ontology/industry> <http://dbpedia.org/resource/Software> or <http://dbpedia.org/resource/Kikuzo_Kisaka> <http://xmlns.com/foaf/0.1/name> "Kisaka, Kikuzo"@en or <http://dbpedia.org/resource/Kiko_Casilla> <http://dbpedia.org/ontology/alias> "Casilla, Francisco" or <http://dbpedia.org/resource/Kiki_S%C3%B8rum> <http://dbpedia.org/ontology/birthDate> "1939-01-16"^^<http://www.w3.org/2001/XMLSchema#date> Triples whose value is an entity are loaded into a list of of arrays of integers containing 3 elements: subject id, property id, value id. 
Ids are assigned as new entities are processed.

Triples whose value is a literal are loaded into one of 4 lists of arrays of
integers, depending on the type of literal, containing 2 elements: subject id,
property id.

Entities and properties that are not found in this file will be ignored in what
follows.
    */
    private void loadTriples() throws Exception {
        HashMap<String, Integer> propertyFrequency = new HashMap<>();
        HashSet<String> shortProperties = new HashSet<>();
        if (minPropertyLength > 1) {
            System.out.println("Finding properties to be ignored because they have length less than " + minPropertyLength);
            int i = 0;
            try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "property_labels"))) {
                String l;
                while ((l = in.readLine()) != null) {
                    i++;
                    if (l.length() > 0) {
                        try {
                            StringTokenizer st = new StringTokenizer(l, "\t<> ");
                            String uri = st.nextToken().trim();
                            if (uri.startsWith("http")) {
                                String label = st.hasMoreTokens() ? st.nextToken().trim() : "";
                                if (label.length() < minPropertyLength && !shortProperties.contains(uri)) {
                                    shortProperties.add(uri);
                                    System.out.println("Property " + uri + " will be ignored, having label " + label);
                                    propertyFrequency.put(uri, 0);
                                }
                            }
                        } catch (Exception e) {
                            System.out.println("Error at line " + i + ": " + l);
                            e.printStackTrace();
                        }
                    }
                }
            }
            System.out.println(shortProperties.size() + " properties will be ignored, having length less than " + minPropertyLength);
        }
        int maxNumberOfProperties = 100000;
        System.out.println("Finding the " + maxNumberOfProperties + " most frequent properties among the properties whose label has at least two characters");
        try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "triples"))) {
            String l = in.readLine();
            int n = 0;
            while (l != null && l.length() > 0) {
                StringTokenizer st = new StringTokenizer(l, "<> \t");
                String subject = st.nextToken();
                String property = st.nextToken();
                String value = st.nextToken();
                if (subject.startsWith("http") && property.startsWith("http") && !shortProperties.contains(property)) {
                    if (value.startsWith("http") || value.startsWith("ftp:")) { //it is an entity
                        Integer c = propertyFrequency.get(property);
                        if (c == null) {
                            propertyFrequency.put(property, 1);
                        } else {
                            propertyFrequency.put(property, 1 + c);
                        }
                    } else { //it is a literal
                        if (value.endsWith("^^")) { //it is a basic type
                            String type = StringEscapeUtils.unescapeJava(st.nextToken());
                            String literalType = basicTypesMapping.get(type);
                            if (literalType != null) {
                                Integer c = propertyFrequency.get(property);
                                if (c == null) {
                                    propertyFrequency.put(property, 1);
                                } else {
                                    propertyFrequency.put(property, 1 + c);
                                }
                            } else {
                                System.out.println("Basic type not recognized in " + l);
                            }
                        } else {
                            if (value.startsWith("\"")) { //it is a String
                                Integer c = propertyFrequency.get(property);
                                if (c == null) {
                                    propertyFrequency.put(property, 1);
                                } else {
                                    propertyFrequency.put(property, 1 + c);
                                }
                            } else {
                                System.out.println("Basic type not recognized in " + l);
                            }
                        }
                    }
                    n++;
                    if (n % 1000000 == 0) {
                        System.out.println("Scanned " + (n / 1000000) + "M triples");
                    }
                } else {
                    //System.out.println("Invalid triple: " + l);
                }
                l = in.readLine();
            }
        }
        shortProperties = null;
        System.gc();
        ArrayList<Map.Entry<String, Integer>> f = new ArrayList<>(propertyFrequency.entrySet());
        Collections.sort(f, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return Integer.compare(o2.getValue(), o1.getValue());
            }
        });
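        //On Java 8+ the same descending-by-frequency ordering can be written more
        //compactly (a sketch, not a behavioral change):
        //    f.sort(Map.Entry.comparingByValue(Comparator.reverseOrder()));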
        int minFreq = 1;
        if (f.size() > maxNumberOfProperties) {
            minFreq = f.get(maxNumberOfProperties - 1).getValue();
            //compare the frequencies, not the Map.Entry objects: if the first excluded
            //entry is tied with the last kept one, raise the threshold to drop the ties
            if (f.get(maxNumberOfProperties).getValue().equals(f.get(maxNumberOfProperties - 1).getValue())) {
                minFreq++;
            }
        }
        for (Map.Entry<String, Integer> e : f) {
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
        System.out.println("Keeping properties with at least " + minFreq + " occurrences");
        HashSet<String> acceptedProperties = new HashSet<>();
        for (Map.Entry<String, Integer> e : propertyFrequency.entrySet()) {
            if (e.getValue() >= minFreq) {
                acceptedProperties.add(e.getKey());
            }
        }
        System.out.println(acceptedProperties.size() + " properties kept over " + f.size());
        f = null;
        propertyFrequency = null;
        System.gc();
        System.out.println("Mapping entity and property URIs to ids");
        int nEntityTriples = 0;
        HashMap<String, Integer> nLiteralTriples = new HashMap<>();
        for (String type : literalTypes) {
            nLiteralTriples.put(type, 0);
        }
        HashSet<String> unrecognizedBasicTypes = new HashSet<>();
        //count entity-valued and literal-valued triples and
        //create the association between uris and ids for entities
        try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "triples"))) {
            String l = in.readLine();
            int n = 0;
            while (l != null && l.length() > 0) {
                StringTokenizer st = new StringTokenizer(l, "<> \t");
                String subject = st.nextToken();
                String property = st.nextToken();
                if (!acceptedProperties.contains(property)) {
                    l = in.readLine();
                    continue;
                }
                String value = st.nextToken();
                if (subject.startsWith("http") && property.startsWith("http")) {
                    Integer idSbj = getEntityIdFromUri(subject);
                    if (idSbj == null) {
                        idSbj = entityIdFromUriWithPrefix.size() + 1;
                        putEntityIdFromUri(subject, idSbj);
                    }
                    Integer idAttr = propertyIdFromUri.get(property);
                    if (idAttr == null) {
                        idAttr = propertyIdFromUri.size() + 1;
                        propertyIdFromUri.put(property, idAttr);
                    }
                    if (value.startsWith("http") || value.startsWith("ftp:")) { //it is an entity
                        Integer idVal = getEntityIdFromUri(value);
                        if (idVal == null) {
                            idVal = entityIdFromUriWithPrefix.size() + 1;
                            putEntityIdFromUri(value, idVal);
                        }
                        Integer idInvAttr = propertyIdFromUri.get(property + "Inv");
                        if (idInvAttr == null) {
                            idInvAttr = propertyIdFromUri.size() + 1;
                            propertyIdFromUri.put(property + "Inv", idInvAttr);
                        }
                        nEntityTriples += 2;
                    } else { //it is a literal
                        if (value.endsWith("^^")) { //it is a basic type
                            String type = StringEscapeUtils.unescapeJava(st.nextToken());
                            String literalType = basicTypesMapping.get(type);
                            if (literalType != null) {
                                nLiteralTriples.put(literalType, nLiteralTriples.get(literalType) + 1);
                            } else {
                                if (!unrecognizedBasicTypes.contains(type)) {
                                    System.out.println("Unrecognized type: " + type);
                                    System.out.println("in line: " + l);
                                    unrecognizedBasicTypes.add(type);
                                }
                            }
                        } else {
                            if (value.startsWith("\"")) { //it is a String
                                nLiteralTriples.put(STRING, nLiteralTriples.get(STRING) + 1);
                            }
                        }
                    }
                    n++;
                    if (n % 1000000 == 0) {
                        System.out.println("Loaded " + (n / 1000000) + "M triples");
                    }
                } else {
                    System.out.println("Invalid triple: " + l);
                }
                l = in.readLine();
            }
        }
        System.out.println("Number of triples with entity value: " + nEntityTriples);
        for (String type : literalTypes) {
            System.out.println("Number of triples with " + type + " value: " + nLiteralTriples.get(type));
        }
        entityTriplesSubjects = new int[nEntityTriples];
        entityTriplesProperties = new int[nEntityTriples];
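        //Note the storage layout: instead of the ArrayList<int[]> kept by an earlier
        //version (see the commented-out fields at the top of the class), the triples
        //are stored column-wise in parallel int arrays; entityTriplesValues, allocated
        //just below, is the third column. For tens of millions of triples this
        //structure-of-arrays layout avoids an object header and a reference per triple.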
entityTriplesValues = new int[nEntityTriples]; for (String type : literalTypes) { literalTriplesSubjects.put(type, new int[nLiteralTriples.get(type)]); literalTriplesProperties.put(type, new int[nLiteralTriples.get(type)]); } //load the triples into the arrays creaded above System.out.println("Loading triples"); try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "triples"))) { String l = in.readLine(); int n = 0; while (l != null && l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "<> \t"); String sbj = st.nextToken(); String attr = st.nextToken(); if (!acceptedProperties.contains(attr)) { l = in.readLine(); continue; } String val = st.nextToken(); if (sbj.startsWith("http") && attr.startsWith("http")) { if (val.startsWith("http") || val.startsWith("ftp:")) { //it is an entity updateTriples(sbj, attr, val, null); } else { //it is a literal if (val.endsWith("^^")) { //it is a basic type String type = StringEscapeUtils.unescapeJava(st.nextToken()); String literalType = basicTypesMapping.get(type); if (literalType != null) { updateTriples(sbj, attr, null, literalType); } else { if (!unrecognizedBasicTypes.contains(type)) { System.out.println("Unrecognized type: " + type); System.out.println("in line: " + l); unrecognizedBasicTypes.add(type); } } } else { if (val.startsWith("\"")) { //it is a String updateTriples(sbj, attr, null, STRING); } else { System.out.println("Unexpected line: " + l); } } } n++; if (n % 1000000 == 0) { System.out.println("Loaded " + (n / 1000000) + "M triples"); } } else { System.out.println("Invalid triple: " + l); } l = in.readLine(); } } System.out.println("Entity value triples: " + entityTriplesSubjects.length); for (String type : literalTriplesSubjects.keySet()) { System.out.println(type + " value triples: " + literalTriplesSubjects.get(type).length); } propertyUri = new String[propertyIdFromUri.size() + 1]; for (Map.Entry<String, Integer> e : propertyIdFromUri.entrySet()) { propertyUri[e.getValue()] = e.getKey(); } entityUriWithPrefix = new String[entityIdFromUriWithPrefix.size() + 1]; for (Map.Entry<String, Integer> e : entityIdFromUriWithPrefix.entrySet()) { entityUriWithPrefix[e.getValue()] = e.getKey(); } //entityUri = new String[entityIdFromUri.size() + 1]; //for (Map.Entry<String, Integer> e : entityIdFromUri.entrySet()) { // entityUri[e.getValue()] = e.getKey(); //} entityLabels = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityLabels = new HashSet[entityIdFromUri.size() + 1]; entityClasses = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityClasses = new HashSet[entityIdFromUri.size() + 1]; propertyLabels = new HashSet[propertyIdFromUri.size() + 1]; entityOutProperties = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityOutProperties = new HashSet[entityIdFromUri.size() + 1]; entityInProperties = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityInProperties = new HashSet[entityIdFromUri.size() + 1]; propertyOutProperties = new HashSet[propertyIdFromUri.size() + 1]; propertyInProperties = new HashSet[propertyIdFromUri.size() + 1]; propertyHasLiteralRange = new boolean[propertyIdFromUri.size() + 1]; propertyCount = new int[propertyIdFromUri.size() + 1]; } /* The file property_labels must contain one row per property, with the form uri \t label e.g. http://dbpedia.org/ontology/wimbledonMixed wimbledon mixed uris and labels can be inside angular brackets, e.g. 
<http://dbpedia.org/ontology/wimbledonMixed> <wimbledon mixed> or <http://dbpedia.org/ontology/wimbledonMixed> wimbledon mixed or http://dbpedia.org/ontology/wimbledonMixed <wimbledon mixed> no other separators are supposed to be used The labels of propertys that are not used in triples are ignored */ private void processePropertyLabelsFile(String fileName) throws Exception { try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + fileName))) { String l = in.readLine(); while (l != null) { if (l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "\t<>"); String uri = st.nextToken().trim(); Integer id = propertyIdFromUri.get(uri); if (id != null) { //we ignore the labels of propertys not used in triples try { String label = st.nextToken().trim(); if (label.length() > 1) { //System.out.println(uri + "\t" + label); if (propertyLabels[id] == null) { propertyLabels[id] = new HashSet<>(); } propertyLabels[id].add(label); if (uri.endsWith("Inv")) { System.out.println("Label \"" + label + "\" for inverted property " + uri); } Integer idInv = propertyIdFromUri.get(uri + "Inv"); if (idInv != null) { if (propertyLabels[idInv] == null) { propertyLabels[idInv] = new HashSet<>(); } propertyLabels[idInv].add(label + " [inverted]"); } } } catch (Exception e) { e.printStackTrace(); System.out.println("Line: " + l); } } } l = in.readLine(); } } } private void loadPropertyLabels() throws Exception { System.out.println("Loading property labels"); processePropertyLabelsFile("property_labels"); System.out.println("Loading additional property labels"); try { processePropertyLabelsFile("additional_property_labels"); } catch (Exception e) { e.printStackTrace(); } //now, we drop the propertys without a label from the map of uri -> id for (int i = 1; i < propertyLabels.length; i++) { if (propertyLabels[i] == null) { propertyIdFromUri.remove(propertyUri[i]); propertyUri[i] = null; } } } /* The file class_labels must contain one row per class, with the form uri \t label e.g. http://dbpedia.org/ontology/ProtectedArea protected area uris and labels can be inside angular brackets, e.g. 
<http://dbpedia.org/ontology/ProtectedArea> <protected area> or <http://dbpedia.org/ontology/ProtectedArea> protected area or http://dbpedia.org/ontology/ProtectedArea <protected area> no other separators are supposed to be used */ private void processClassLabelsFile(String fileName, ArrayList<HashSet<String>> labels) throws Exception { try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + fileName))) { String l = in.readLine(); while (l != null) { if (l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "\t<>"); try { String uri = st.nextToken().trim(); String label = st.nextToken().trim(); //System.out.println(uri + "\t" + label); if (!classIdFromUri.containsKey(uri)) { classIdFromUri.put(uri, labels.size() + 1); labels.add(new HashSet<String>()); labels.get(labels.size() - 1).add(label); } else { labels.get(classIdFromUri.get(uri) - 1).add(label); } } catch (Exception e) { System.out.println("Error with line " + l); e.printStackTrace(); } } l = in.readLine(); } } } private void loadClassLabels() throws Exception { ArrayList<HashSet<String>> labels = new ArrayList<>(); classIdFromUri.put(THING, 1); thingId = 1; labels.add(new HashSet<String>()); labels.get(0).add("thing"); System.out.println("Loading class labels"); processClassLabelsFile("class_labels", labels); try { processClassLabelsFile("additional_class_labels", labels); } catch (Exception e) { e.printStackTrace(); } classLabels = new HashSet[labels.size() + 1]; int i = 1; for (HashSet<String> l : labels) { classLabels[i] = l; i++; } classUri = new String[classIdFromUri.size() + 1]; for (Map.Entry<String, Integer> e : classIdFromUri.entrySet()) { classUri[e.getValue()] = e.getKey(); } } /* The file class_parents must contain one or more rows per class, with the form uri \t uri_parent e.g. http://dbpedia.org/ontology/Actor http://dbpedia.org/ontology/Artist uris can be inside angular brackets, e.g. 
<http://dbpedia.org/ontology/Actor> <http://dbpedia.org/ontology/Artist> or <http://dbpedia.org/ontology/Actor> http://dbpedia.org/ontology/Artist or http://dbpedia.org/ontology/Actor <http://dbpedia.org/ontology/Artist> no other separators are supposed to be used */ private void loadClassHierarchy() throws Exception { System.out.println("Loading class parents and building the hierarchy"); //firs, we initialize class parents classParents = new HashSet[classIdFromUri.size() + 1]; for (int i = 1; i < classParents.length; i++) { classParents[i] = new HashSet<>(); //we don't initialize class ancestors because the null value is used to check if the class has not been processed yet } try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "class_parents"))) { String l = in.readLine(); while (l != null) { if (l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "\t<>"); String claz = st.nextToken().trim(); String parent = st.nextToken().trim(); //we are interested only in the hierarchical relationships between //classes defined inside our ontology Integer cId = classIdFromUri.get(claz); Integer pId = classIdFromUri.get(parent); if (cId != null && pId != null && !pId.equals(cId)) { classParents[cId].add(pId); } } l = in.readLine(); } //now add Thing to empty sets of parents for (int cId = 1; cId < classParents.length; cId++) { if (classParents[cId].isEmpty()) { classParents[cId].add(thingId); } } classParents[thingId].clear(); //now, for each class compute the set of its ancestors classAncestors = new HashSet[classIdFromUri.size() + 1]; for (int cId = 1; cId < classAncestors.length; cId++) { computeClassAncestors(cId); } //now, reduce the set of class parents, by keeping only the most specific classes for (int cId = 1; cId < classParents.length; cId++) { HashSet<Integer> currentParents = classParents[cId]; HashSet<Integer> reducedParents = new HashSet<>(); for (Integer pId : currentParents) { //check if reducedParents contains an ancestor of parent, //or if parent is an ancestor of any class in reducedParents boolean add = true; for (Iterator<Integer> it = reducedParents.iterator(); it.hasNext();) { Integer c = it.next(); if (classAncestors[c].contains(pId)) { add = false; //we don't add parent, beacause c is a descendant of parent break; } else if (classAncestors[pId].contains(c)) { it.remove(); //we remove c beacause parent is a descendant of c } } if (add) { reducedParents.add(pId); } } classParents[cId] = reducedParents; } //now, compute the class children for each class classChildren = new HashSet[classIdFromUri.size() + 1]; for (int cId = 1; cId < classChildren.length; cId++) { classChildren[cId] = new HashSet<>(); } for (int cId = 1; cId < classParents.length; cId++) { for (Integer pId : classParents[cId]) { classChildren[pId].add(cId); } } //now compute the class descendants for each class classDescendants = new HashSet[classIdFromUri.size() + 1]; for (int cId = 1; cId < classDescendants.length; cId++) { computeClassDescendants(cId); } } } /* Compute the set of ancestors of a class */ private void computeClassAncestors(int cId) { if (classAncestors[cId] != null) { return; //it was already computed } classAncestors[cId] = new HashSet<>(); for (Integer pId : classParents[cId]) { //the parent is an ancestor classAncestors[cId].add(pId); computeClassAncestors(pId); //and the ancestors of the parent are ancestors as well classAncestors[cId].addAll(classAncestors[pId]); } } /* Compute the set of descendants of a class */ private void computeClassDescendants(int cId) { 
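        //Mirror of computeClassAncestors above: a memoized depth-first visit where a
        //non-null entry marks a class whose descendant set is already complete, so
        //each class is expanded at most once. E.g. with the hierarchy Actor -> Artist
        //used in the comments above, descendants(Artist) contains Actor and,
        //transitively, all of Actor's descendants.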
        if (classDescendants[cId] != null) {
            return; //it was already computed
        }
        classDescendants[cId] = new HashSet<>();
        for (Integer chId : classChildren[cId]) {
            //each child is a descendant
            classDescendants[cId].add(chId);
            computeClassDescendants(chId);
            //and the descendants of the child are descendants as well
            classDescendants[cId].addAll(classDescendants[chId]);
        }
    }

    /*
    The file entity_labels must contain one row per entity, with the form
    uri \t label
    e.g.
    http://dbpedia.org/resource/Agatha_Christie    Agatha Christie
    uris can be inside angular brackets, e.g.
    <http://dbpedia.org/resource/Agatha_Christie> <Agatha Christie>
    or
    <http://dbpedia.org/resource/Agatha_Christie> Agatha Christie
    or
    http://dbpedia.org/resource/Agatha_Christie <Agatha Christie>
    no other separators are supposed to be used
    */
    private void processEntityLabelsFile(String fileName) throws Exception {
        try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + fileName))) {
            String l = in.readLine();
            while (l != null) {
                if (l.length() > 0) {
                    StringTokenizer st = new StringTokenizer(l, "\t<>");
                    String uri = st.nextToken();
                    Integer id = getEntityIdFromUri(uri);
                    if (id != null) { //we ignore the labels of entities not used in triples
                        try {
                            String label = st.nextToken();
                            if (entityLabels[id] == null) {
                                entityLabels[id] = new HashSet<>();
                            }
                            entityLabels[id].add(label);
                        } catch (Exception e) {
                            System.out.println("Failed to add label: " + l);
                        }
                    } else {
                        //System.out.println("Ignored label of " + uri);
                    }
                }
                l = in.readLine();
            }
        }
    }

    private void loadEntityLabels() throws Exception {
        System.out.println("Loading entity labels");
        processEntityLabelsFile("entity_labels");
        try {
            processEntityLabelsFile("additional_entity_labels");
        } catch (Exception e) {
            e.printStackTrace();
        }
        //now, we drop the entities without a label from the map of uri -> id
        for (int i = 1; i < entityLabels.length; i++) {
            if (entityLabels[i] == null) {
                if (DROP_UNLABELED_ENTITIES) {
                    entityIdFromUriWithPrefix.remove(entityUriWithPrefix[i]);
                    entityUriWithPrefix[i] = null;
                } else {
                    //create a label from the URI
                    entityLabels[i] = new HashSet<>();
                    entityLabels[i].add(entityUriWithPrefix[i]);
                }
            }
        }
    }

    /*
    The file entity_classes must contain one row per pair <entity, class>, with the form
    uri_entity \t uri_class
    e.g.
    http://dbpedia.org/resource/Autism    http://dbpedia.org/ontology/Disease
    uris can be inside angular brackets, e.g.
<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/Disease> or <http://dbpedia.org/resource/Autism> http://dbpedia.org/ontology/Disease or http://dbpedia.org/resource/Autism <http://dbpedia.org/ontology/Disease> no other separators are supposed to be used */ private void loadEntityClasses() throws Exception { System.out.println("Loading entity classes, and keeping only the most specific"); int count = 0; HashSet<Integer> notEmptyClasses = new HashSet<>(); try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "entity_classes"))) { String l = in.readLine(); while (l != null) { try { StringTokenizer st = new StringTokenizer(l, "\t<>"); String uriE = st.nextToken(); String uriC = st.nextToken(); Integer idE = getEntityIdFromUri(uriE);//entityIdFromUri.get(uriE); Integer idC = classIdFromUri.get(uriC); if (!uriC.equals(THING) && idE != null && idC != null && entityLabels[idE] != null && classLabels[idC] != null) { //we ignore the classes without label and the classes of entities not used in triples //we also ignore thing as class, since every entity is implicitly a thing HashSet<Integer> classes = entityClasses[idE]; if (classes == null) { classes = new HashSet<>(); entityClasses[idE] = classes; count++; } //check if classes contains an ancestor of uriC, //or if uriC is an ancestor of any class in classes boolean add = true; for (Iterator<Integer> it = classes.iterator(); it.hasNext();) { Integer c = it.next(); if (classAncestors[c].contains(idC)) { add = false; //we don't add class, beacause c is a descendant of class break; } else if (classAncestors[idC].contains(c)) { it.remove(); //we remove c beacause uriC is a descendant of c } } if (add) { classes.add(idC); notEmptyClasses.add(idC); } } } catch (Exception e) { System.out.println("Failed to load class: " + l); } l = in.readLine(); } } System.out.println(count + " entities have been assigned a non-thing class"); count = 0; //now, set Thing as class of entities without a class for (int i = 1; i < entityClasses.length; i++) { if (entityClasses[i] == null && entityLabels[i] != null) { entityClasses[i] = new HashSet<>(); entityClasses[i].add(thingId); count++; } } System.out.println(count + " entities have been assigned thing class"); //now drop the classes without entities and without descendant classes - asking for those classes would produce empty results could confuse the user for (int i = 1; i < classLabels.length; i++) { if (i != thingId && !notEmptyClasses.contains(i) && classDescendants[i].isEmpty()) { classLabels[i] = null; classUri[i] = null; } } } /* The file basic_type_literal_type must contain one row per basic type, with the form uri \t literal_type where literal type can be Double, Date, String, Boolean e.g. http://dbpedia.org/datatype/centimetre Double uri and basic type can be inside angular brackets, e.g. 
<http://dbpedia.org/datatype/centimetre> <Double> or <http://dbpedia.org/datatype/centimetre> <Double> or <http://dbpedia.org/datatype/centimetre> <Double> no other separators are supposed to be used */ private void loadBasicTypesMapping() throws Exception { System.out.println("Loading basic types mappings"); try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "basic_types_literal_types"))) { String l = in.readLine(); while (l != null) { StringTokenizer st = new StringTokenizer(l, "\t<>"); String uri = st.nextToken(); String literal = st.nextToken(); //System.out.println(uri + "\t" + label); basicTypesMapping.put(uri, literal); l = in.readLine(); } } } //thios not used for the following reason: if an entity of class C has a property p, then C will have the property p - if no entities of C have property p, we can safely avoid to ask for "p of C", even if a superclass of C has p - in fact, no result would be returned private void propagatePropertiesToDescendantClasses(HashSet<Integer>[] classProperties, int claz) { HashSet<Integer> properties = classProperties[claz]; if (properties == null) { properties = new HashSet<>(); classProperties[claz] = properties; } for (Integer child : classChildren[claz]) { if (classProperties[child] == null) { classProperties[child] = new HashSet<>(); } classProperties[child].addAll(properties); propagatePropertiesToDescendantClasses(classProperties, child); } } private void propagatePropertiesToAncestorClasses(HashSet<Integer>[] classProperties, int claz) { for (int childClass : classChildren[claz]) { propagatePropertiesToAncestorClasses(classProperties, childClass); } HashSet<Integer> properties = classProperties[claz]; if (properties == null) { properties = new HashSet<>(); classProperties[claz] = properties; } for (Integer child : classChildren[claz]) { properties.addAll(classProperties[child]); } } private HashSet<Integer> findCommonLowestAncestor(int c1, int c2) { HashSet<Integer> res = new HashSet<>(); if (c1 == c2) { res.add(c1); return res; } if (classAncestors[c1].contains(c2)) { res.add(c2); return res; } else if (classAncestors[c2].contains(c1)) { res.add(c1); return res; } //find common ancestors HashSet<Integer> temp = new HashSet<>(); for (int a : classAncestors[c1]) { if (classAncestors[c2].contains(a)) { temp.add(a); } } for (int candidateClass : temp) { boolean add = true; for (Iterator<Integer> it = res.iterator(); it.hasNext();) { int existingClass = it.next(); if (classAncestors[existingClass].contains(candidateClass)) { //candidate class is an ancestor of existing class add = false; break; } else if (classAncestors[candidateClass].contains(existingClass)) { //existing class is an ancestor of candodate class it.remove(); //don't break - candidate class could be the ancestor of other existing classes in res } } if (add) { res.add(candidateClass); } } return res; } private HashSet<Integer> findCommonLowestAncestor(Set<Integer> classes) { HashSet<Integer> finalClasses = new HashSet<>(classes); while (finalClasses.size() > 1) { Iterator<Integer> it = finalClasses.iterator(); int c1 = it.next(); it.remove(); int c2 = it.next(); it.remove(); HashSet<Integer> cas = findCommonLowestAncestor(c1, c2); for (int ca : cas) { finalClasses.add(ca); } } return finalClasses; } private void updateOutAndInEntityAndLiteralTypeProperties(Integer property, Integer subj, Integer entityVal, String literalType) { if (entityOutProperties[subj] == null) { entityOutProperties[subj] = new HashSet<>(); } entityOutProperties[subj].add(property); if 
(entityVal != null) { if (entityInProperties[entityVal] == null) { entityInProperties[entityVal] = new HashSet<>(); } entityInProperties[entityVal].add(property); } if (literalType != null) { literalTypesInProperties.get(literalType).add(property); } } private void processTriples() throws Exception { int droppedEntityTriples = 0; int droppedLiteralTriples = 0; try (PrintWriter out = new PrintWriter(new FileOutputStream(basePathInput + "dropped_triples", false), true)) { System.out.println("Dropping triples with undefined elements"); for (int i = 0; i < entityTriplesSubjects.length; i++) { int sbj = entityTriplesSubjects[i]; int val = entityTriplesValues[i]; int attr = entityTriplesProperties[i]; if (entityUriWithPrefix[sbj] == null || entityUriWithPrefix[val] == null || propertyUri[attr] == null) {//if (entityUri[sbj] == null || entityUri[val] == null || propertyUri[attr] == null) { out.println( entityUriWithPrefix[sbj] + "\t" + propertyUri[attr] + "\t" + entityUriWithPrefix[val]); entityTriplesSubjects[i] = 0; droppedEntityTriples++; } else { propertyCount[attr]++; } } for (String type : literalTypes) { for (int i = 0; i < literalTriplesSubjects.get(type).length; i++) { int sbj = literalTriplesSubjects.get(type)[i]; int attr = literalTriplesProperties.get(type)[i]; if (entityUriWithPrefix[sbj] == null || propertyUri[attr] == null) {//if (entityUri[sbj] == null || propertyUri[attr] == null) { out.println(entityUriWithPrefix[sbj] + "\t" + propertyUri[attr]); literalTriplesSubjects.get(type)[i] = 0; droppedLiteralTriples++; } else { propertyCount[attr]++; } } } } catch (Exception e) { e.printStackTrace(); } System.out.println("Dropped " + droppedEntityTriples + " triples with entity value and " + droppedLiteralTriples + " with literal value"); System.out.println( "Scanning the triples to compute out-propertys of entities and in-propertys of entities and literal basic types"); //first compute out-propertys and in-propertys of entities //and in-propertys of basic types //<sbj, attr, val>, where val is an entity -> add attr to out-propertys of sbj and in-propertys of val int c = 0; for (int i = 0; i < entityTriplesSubjects.length; i++) { if (entityTriplesSubjects[i] == 0) { //it was previously dropped continue; } int sbj = entityTriplesSubjects[i]; int attr = entityTriplesProperties[i]; int val = entityTriplesValues[i]; if (entityUriWithPrefix[sbj] != null && propertyUri[attr] != null && entityUriWithPrefix[val] != null) {//if (entityUri[sbj] != null && propertyUri[attr] != null && entityUri[val] != null) { updateOutAndInEntityAndLiteralTypeProperties(attr, sbj, val, null); } c++; if (c % 1000000 == 0) { System.out.println("Processed " + (c / 1000000) + "M triples"); } } entityTriplesSubjects = null; entityTriplesProperties = null; entityTriplesValues = null; System.gc(); //<sbj, attr, type>, where type is a basic type -> add attr to out-propertys of sbj and in-propertys of type for (String literalType : literalTypes) { for (int i = 0; i < literalTriplesSubjects.get(literalType).length; i++) { if (literalTriplesSubjects.get(literalType)[i] == 0) { //it was previously dropped continue; } int sbj = literalTriplesSubjects.get(literalType)[i]; int attr = literalTriplesProperties.get(literalType)[i]; if (entityUriWithPrefix[sbj] != null && propertyUri[attr] != null) {//if (entityUri[sbj] != null && propertyUri[attr] != null) { updateOutAndInEntityAndLiteralTypeProperties(attr, sbj, null, literalType); propertyHasLiteralRange[attr] = true; } c++; if (c % 1000000 == 0) { System.out.println("Processed 
" + (c / 1000000) + "M triples"); } } } literalTriplesSubjects = null; literalTriplesProperties = null; System.gc(); System.out.println("Scanning the entity out-propertys to compute out-propertys of classes"); //now it is possible to compute the out-propertys and in-propertys of classes //entityOutProperties of e contains a -> add a to classOutProperties of all the classes of e classOutProperties = new HashSet[classUri.length]; for (int i = 1; i < entityOutProperties.length; i++) { if (entityOutProperties[i] != null && entityClasses[i] != null) { for (int property : entityOutProperties[i]) { for (int claz : entityClasses[i]) { if (classOutProperties[claz] == null) { classOutProperties[claz] = new HashSet<>(); } classOutProperties[claz].add(property); } } } } //System.out.println("Propagating the out-propertys to descendant classes"); //propagatePropertiesToDescendantClasses(classOutProperties, thingId); System.out.println("Propagating the out-propertys to ancestor classes"); propagatePropertiesToAncestorClasses(classOutProperties, thingId); if (printFiles) { System.out.println("Writing the classOutProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "class_out_propertys", false), true)) { for (int i = 1; i < classOutProperties.length; i++) { if (i % 10 == 0) { out.flush(); } if (classOutProperties[i] != null) { out.print(classUri[i]); for (Integer a : classOutProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); /* System.out.print(classLabels[i] + " - outProperties: "); for (Integer a : classOutProperties[i]) { System.out.print("\t" + propertyLabels[a]); } System.out.println();*/ } } } } System.out.println("Scanning the entity in-propertys to compute in-propertys of classes"); //entityInProperties of e contains a -> add a to classInProperties of all the classes of e classInProperties = new HashSet[classUri.length]; for (int i = 1; i < entityInProperties.length; i++) { if (entityInProperties[i] != null && entityClasses[i] != null) { for (int property : entityInProperties[i]) { for (int claz : entityClasses[i]) { if (classInProperties[claz] == null) { classInProperties[claz] = new HashSet<>(); } classInProperties[claz].add(property); } } } } //System.out.println("Propagating the in-propertys to descendant classes"); //propagatePropertiesToDescendantClasses(classInProperties, thingId); System.out.println("Propagating the in-propertys to ancestor classes"); propagatePropertiesToAncestorClasses(classOutProperties, thingId); if (printFiles) { System.out.println("Writing the classInProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "class_in_propertys", false), true)) { for (int i = 1; i < classInProperties.length; i++) { if (i % 10 == 0) { out.flush(); } if (classInProperties[i] != null) { out.print(classUri[i]); for (Integer a : classInProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); /* System.out.print(classLabels[i] + " - inProperties: "); for (Integer a : classInProperties[i]) { System.out.print("\t" + propertyLabels[a]); } System.out.println();*/ } } } } System.out.println("Scanning the triples to compute out- and in-propertys of propertys"); //now it is possible to compute the out-propertys and in-propertys of propertys //<t[0], t[1], t[2]> -> add t[1] to outProperties[property] for each property in entityInProperties[t[0]] /* OLD WAY c = 0; propertyOutProperties = new HashSet[propertyUri.length]; for (int[] t : entityTriples) { if (entityInProperties[t[0]] != null) { for (int 
property : entityInProperties[t[0]]) { if (propertyOutProperties[property] == null) { propertyOutProperties[property] = new HashSet<>(); } propertyOutProperties[property].add(t[1]); } } c++; if (c % 1000000 == 0) { System.out.println("Processed " + (c / 1000000) + "M triples"); } } for (LinkedList<int[]> lt : literalTriples.values()) { for (int[] t : lt) { if (entityInProperties[t[0]] != null) { for (int property : entityInProperties[t[0]]) { if (propertyOutProperties[property] == null) { propertyOutProperties[property] = new HashSet<>(); } propertyOutProperties[property].add(t[1]); } } c++; if (c % 1000000 == 0) { System.out.println("Processed " + (c / 1000000) + "M triples"); } } } */ for (int entity = 1; entity < entityInProperties.length; entity++) { if (entityInProperties[entity] != null) { for (int property : entityInProperties[entity]) { if (entityOutProperties[entity] != null && !entityOutProperties[entity].isEmpty()) { if (propertyOutProperties[property] == null) { propertyOutProperties[property] = new HashSet<>(); } propertyOutProperties[property].addAll(entityOutProperties[entity]); } if (propertyInProperties[property] == null) { propertyInProperties[property] = new HashSet<>(); } propertyInProperties[property].addAll(entityInProperties[entity]); } } } //I will use the literalTypesInProperties when I index the property with rangeOf /* for (String type : literalTypes) { for (int property : literalTypesInProperties.get(type)) { if (propertyInProperties[property] == null) { propertyInProperties[property] = new HashSet<>(); } propertyInProperties[property].addAll(literalTypesInProperties.get(type)); } } */ //System.out.println("Scanning the triples to compute in-propertys of propertys"); //<t[0], t[1], t[2]> -> add t[1] to inProperties[property] for each property in entityInProperties[t[2]] /*OLD WAY c = 0; propertyInProperties = new HashSet[propertyUri.length]; for (int[] t : entityTriples) { if (entityInProperties[t[2]] != null) { for (int property : entityInProperties[t[2]]) { if (propertyInProperties[property] == null) { propertyInProperties[property] = new HashSet<>(); } propertyInProperties[property].add(t[1]); } } c++; if (c % 1000000 == 0) { System.out.println("Processed " + (c / 1000000) + "M triples"); } } //<t[0], t[1], type> -> add t[1] to inProperties[property] for each property in literalValueInProperties[type] //it is possible to avoid the scan of the triples, since we already know the inProperty of each literal basic type for (String type : literalTriples.keySet()) { for (int property1 : literalTypesInProperties.get(type)) { if (propertyInProperties[property1] == null) { propertyInProperties[property1] = new HashSet<>(); } for (int property2 : literalTypesInProperties.get(type)) { propertyInProperties[property1].add(property2); } } } */ //write the in/Out-Entity/Class-Properties if (printFiles) { System.out.println("Writing the entityInProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "entity_in_propertys", false), true)) { for (int i = 1; i < entityInProperties.length; i++) { if (i % 100 == 0) { out.flush(); } if (entityInProperties[i] != null) { out.print(entityUriWithPrefix[i]);//out.print(entityUri[i]); for (Integer a : entityInProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); if (i % 100000 == 0) { System.out.print(entityLabels[i] + " - inProperties: "); for (Integer a : entityInProperties[i]) { System.out.print("\t" + propertyLabels[a]); } System.out.println(); } } } } } if (printFiles) { 
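            //Diagnostic dump: when printFiles is enabled, one row is written per entity,
            //holding the prefix-compressed entity URI followed by the tab-separated URIs
            //of its out-properties.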
System.out.println("Writing the entityOutProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "entity_out_propertys", false), true)) { for (int i = 1; i < entityOutProperties.length; i++) { if (i % 100 == 0) { out.flush(); } if (entityOutProperties[i] != null) { out.print(entityUriWithPrefix[i]);//out.print(entityUri[i]); for (Integer a : entityOutProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); if (i % 100000 == 0) { System.out.print(entityLabels[i] + " - outProperties: "); for (Integer a : entityOutProperties[i]) { System.out.print("\t" + propertyLabels[a]); } System.out.println(); } } } } } //write the literalInProperties if (printFiles) { try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "literal_types_in_propertys", false), true)) { for (Map.Entry<String, HashSet<Integer>> e : literalTypesInProperties.entrySet()) { out.print(e.getKey()); for (Integer a : e.getValue()) { out.print("\t" + propertyUri[a]); } out.println(); /* System.out.print(e.getKey() + " - inProperties: "); for (Integer a : e.getValue()) { System.out.print("\t" + propertyLabels[a]); } System.out.println(); */ } } } if (printFiles) { try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "property_in_propertys", false), true)) { for (int property = 1; property < propertyUri.length; property++) { if (property % 10 == 0) { out.flush(); } if (propertyInProperties[property] != null && !propertyInProperties[property].isEmpty()) { out.print(propertyUri[property]); for (Integer a : propertyInProperties[property]) { out.print("\t" + propertyUri[a]); } out.println(); /* System.out.print(propertyLabels[property] + " - inProperties: "); for (Integer a : propertyInProperties[property]) { System.out.print("\t" + propertyLabels[a]); } System.out.println(); */ } } } } if (printFiles) { try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "property_out_propertys", false), true)) { for (int property = 1; property < propertyUri.length; property++) { if (property % 10 == 0) { out.flush(); } if (propertyOutProperties[property] != null && !propertyOutProperties[property].isEmpty()) { out.print(propertyUri[property]); for (Integer a : propertyOutProperties[property]) { out.print("\t" + propertyUri[a]); } out.println(); /* System.out.print(propertyLabels[property] + " - outProperties: "); for (Integer a : propertyOutProperties[property]) { System.out.print("\t" + propertyLabels[a]); } System.out.println(); */ } } } } } private static void indexOntologyElement(IndexWriter writer, OntologyElementToken e, Collection<String> domainOf, Collection<String> rangeOf, Collection<String> extendedDomain) throws Exception { Document doc = new Document(); doc.add(new Field("label", e.getLabel(), TextField.TYPE_NOT_STORED)); doc.add(new IntField("id", e.getId(), IntField.TYPE_STORED)); doc.add(new Field("type", e.getType(), StringField.TYPE_NOT_STORED)); if (domainOf != null) { for (String d : domainOf) { //the first element is the URI doc.add(new Field("domainOfProperty", d, StringField.TYPE_NOT_STORED)); } } if (rangeOf != null) { for (String r : rangeOf) { //the first element is the URI doc.add(new Field("rangeOfProperty", r, StringField.TYPE_NOT_STORED)); } } if (extendedDomain != null) { for (String d : extendedDomain) { //the first element is the URI doc.add(new Field("propertyDomain", d, StringField.TYPE_NOT_STORED)); } } writer.addDocument(doc); } /* private static void indexNonOntologyElement(IndexWriter writer, 
IndexedToken e, Collection<String> domainOf, Collection<String> rangeOf, Collection<String> extendedDomain) throws Exception { Document doc = new Document(); doc.add(new Field("label", e.getText(), TextField.TYPE_NOT_STORED)); doc.add(new Field("id", e.getId(), TextField.TYPE_STORED)); doc.add(new Field("type", e.getType(), TextField.TYPE_NOT_STORED)); if (domainOf != null) { for (String d : domainOf) { //the first element is the URI doc.add(new Field("domainOfProperty", d, StringField.TYPE_NOT_STORED)); } } if (rangeOf != null) { for (String r : rangeOf) { //the first element is the URI doc.add(new Field("rangeOfProperty", r, StringField.TYPE_NOT_STORED)); } } if (extendedDomain != null) { for (String d : extendedDomain) { //the first element is the URI doc.add(new Field("propertyDomain", d, StringField.TYPE_NOT_STORED)); } } writer.addDocument(doc); } */ private void indexEntities(IndexWriter writer, HashMap<Integer, IndexedToken> elements) throws Exception { for (int i = 1; i < entityUriWithPrefix.length; i++) {//for (int i = 1; i < entityUri.length; i++) { if (entityUriWithPrefix[i] != null) {//if (entityUri[i] != null) { HashSet<String> domainOf = new HashSet<>(); HashSet<String> rangeOf = new HashSet<>(); if (entityOutProperties[i] != null) { for (int a : entityOutProperties[i]) { domainOf.add(propertyUri[a]); } } if (entityInProperties[i] != null) { for (int a : entityInProperties[i]) { rangeOf.add(propertyUri[a]); } } for (String label : entityLabels[i]) { EntityToken element = new EntityToken(getEntityCompleteUri(entityUriWithPrefix[i]), label, false);//EntityToken element = new EntityToken(entityUri[i], label); indexOntologyElement(writer, element, domainOf, rangeOf, null); elements.put(element.getId(), element); } } } entityOutProperties = null; entityInProperties = null; System.gc(); } private void indexClasses(IndexWriter writer, HashMap<Integer, IndexedToken> elements) throws Exception { HashSet<Character> vowels = new HashSet<>(); vowels.add('a'); vowels.add('e'); vowels.add('i'); vowels.add('o'); vowels.add('u'); for (int i = 1; i < classUri.length; i++) { if (classUri[i] != null) { HashSet<String> domainOf = new HashSet<>(); HashSet<String> rangeOf = new HashSet<>(); if (classOutProperties[i] == null) { classOutProperties[i] = new HashSet<>(); } for (int a : classOutProperties[i]) { domainOf.add(propertyUri[a]); } if (classInProperties[i] == null) { classInProperties[i] = new HashSet<>(); } for (int a : classInProperties[i]) { rangeOf.add(propertyUri[a]); } for (String label : classLabels[i]) { label = label.toLowerCase(); ClassToken elementSingular = new ClassToken(classUri[i], label, IndexedToken.SINGULAR, false); indexOntologyElement(writer, elementSingular, domainOf, rangeOf, null); elements.put(elementSingular.getId(), elementSingular); //now create the plural form String pLabel; if (label.endsWith("y") && !vowels.contains(label.charAt(label.length() - 2))) { pLabel = label.substring(0, label.length() - 1) + "ies"; } else if (label.endsWith("s") || label.endsWith("sh") || label.endsWith("ch") || label.endsWith("x") || label.endsWith("z")) { pLabel = label + "es"; } else if (label.equals("person")) { pLabel = "people"; } else { pLabel = label + "s"; } ClassToken elementPlural = new ClassToken(classUri[i], pLabel, IndexedToken.PLURAL, false); indexOntologyElement(writer, elementPlural, domainOf, rangeOf, null); elements.put(elementPlural.getId(), elementPlural); } } } } private void indexProperties(IndexWriter writer, HashMap<Integer, IndexedToken> elements) throws 
Exception { //precompute the domains of propertys HashSet<String>[] propertyDomains = new HashSet[propertyUri.length]; //the domain of an property a is the set of classes and propertys having a in their outProperty for (int claz = 1; claz < classOutProperties.length; claz++) { if (classOutProperties[claz] != null && classUri[claz] != null) { for (int a : classOutProperties[claz]) { if (propertyDomains[a] == null) { propertyDomains[a] = new HashSet<>(); } propertyDomains[a].add(classUri[claz]); } } } for (int property = 1; property < propertyOutProperties.length; property++) { if (propertyOutProperties[property] != null) { for (int a : propertyOutProperties[property]) { if (propertyDomains[a] == null) { propertyDomains[a] = new HashSet<>(); } propertyDomains[a].add(propertyUri[property]); } } } //precompute the literal ranges of every property HashSet<String>[] propertyLiteralRanges = new HashSet[propertyUri.length]; for (int i = 1; i < propertyLiteralRanges.length; i++) { propertyLiteralRanges[i] = new HashSet<>(); } for (String literalType : literalTypesInProperties.keySet()) { for (int property : literalTypesInProperties.get(literalType)) { propertyLiteralRanges[property].add(literalType); } } for (int property = 1; property < propertyUri.length; property++) { if (propertyUri[property] != null) { HashSet<String> domainOf = new HashSet<>(); if (propertyOutProperties[property] != null) { for (int a : propertyOutProperties[property]) { domainOf.add(propertyUri[a]); } } HashSet<String> rangeOf = new HashSet<>(); if (propertyInProperties[property] != null) { for (int a : propertyInProperties[property]) { rangeOf.add(propertyUri[a]); } } for (String type : literalTypes) { if (literalTypesInProperties.get(type).contains(property)) { for (int a : literalTypesInProperties.get(type)) { rangeOf.add(propertyUri[a]); } } } for (String label : propertyLabels[property]) { HashSet<String> aDomains = propertyDomains[property]; PropertyToken element = new PropertyToken(propertyUri[property], label, IndexedToken.UNDEFINED, IndexedToken.UNDEFINED, propertyOutProperties[property] != null && !propertyOutProperties[property].isEmpty(), propertyHasLiteralRange[property], false); indexOntologyElement(writer, element, domainOf, rangeOf, aDomains); element.setPropertyAndClassDomain(aDomains); element.addBasicTypeRanges(propertyLiteralRanges[property]); elements.put(element.getId(), element); } } } classOutProperties = null; classInProperties = null; propertyOutProperties = null; propertyInProperties = null; System.gc(); } public void start() throws Exception { long t = System.currentTimeMillis(); loadBasicTypesMapping(); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); loadTriples(); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); loadPropertyLabels(); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); loadClassLabels(); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); loadClassHierarchy(); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); loadEntityLabels(); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); loadEntityClasses(); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); entityIdFromUriWithPrefix = null;//entityIdFromUri = null; classIdFromUri = null; propertyIdFromUri = null; System.gc(); processTriples(); System.out.println(System.currentTimeMillis() - t); t = 
System.currentTimeMillis(); HashMap<String, Analyzer> analyzerMap = new HashMap<>(); analyzerMap.put("label", new EnglishAnalyzer(CharArraySet.EMPTY_SET)); analyzerMap.put("id", new WhitespaceAnalyzer()); analyzerMap.put("type", new WhitespaceAnalyzer()); analyzerMap.put("domainOfProperty", new WhitespaceAnalyzer()); analyzerMap.put("rangeOfProperty", new WhitespaceAnalyzer()); analyzerMap.put("propertyDomain", new WhitespaceAnalyzer()); Analyzer analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), analyzerMap); HashMap<Integer, IndexedToken> elements = new HashMap<>(); try (FSDirectory directory = FSDirectory.open(Paths.get(basePathOutput + "lucene"))) { IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); try (IndexWriter writer = new IndexWriter(directory, iwc)) { System.out.println("Indexing entities"); indexEntities(writer, elements); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); System.out.println("Indexing classes"); indexClasses(writer, elements); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); System.out.println("Indexing propertys"); indexProperties(writer, elements); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); /* System.out.println("Indexing constraint uriToPrefix"); indexConstraintPrefixes(writer, elements); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); */ } catch (Exception e) { e.printStackTrace(); } } //save elements to file System.out.println("Creating the trie"); Trie trie = new Trie(); int c = 0; for (IndexedToken it : elements.values()) { trie.add(it.getText()); c++; if (c % 100000 == 0) { System.out.println(c + " elements added to the trie"); } } System.out.println(c + " elements added to the trie"); c = 0; for (IndexedToken it : elements.values()) { String suffix = trie.getOneSuffix(it.getText()); if (suffix != null) { it.setPrefix(true); c++; } } System.out.println(c + " are prefix of another element"); System.out.println("Serializing the tokens"); try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(basePathOutput + "elements"))) { oos.writeObject(elements); oos.writeInt(IndexedToken.counter); } } public static void main(String... 
args) throws Exception { String fn1 = null, fn2 = null; if (args != null && args.length == 2) { fn1 = args[0]; fn2 = args[1]; } else { //fn1 = "/home/massimo/aquawd/dbpedia-ontology-extended-files/"; //fn2 = "/home/massimo/aquawd/processed-dbpedia-ontology-extended/"; //fn1 = "/home/massimo/aquawd/dbpedia-ontology-files/"; //fn2 = "/home/massimo/aquawd/processed-dbpedia-ontology/"; //fn1 = "/home/massimo/aquawd/musicbrainz-old-ontology-files/"; //fn2 = "/home/massimo/aquawd/processed-musicbrainz-old-ontology/"; //fn1 = "/home/massimo/aquawd/biomedical-ontology-files/"; //fn2 = "/home/massimo/aquawd/processed-biomedical-ontology/"; //fn1 = "/home/massimo/canalikbs/dbpedia/2015-10/processed/"; //fn2 = "/home/massimo/canalikbs/dbpedia/2015-10/index_new/"; //fn1 = "/home/massimo/canalikbs/musicbrainz/qald3/processed/"; //fn2 = "/home/massimo/canalikbs/musicbrainz/qald3/index/"; //fn1 = "/home/massimo/canalikbs/biomedical/qald4/processed/"; //fn2 = "/home/massimo/canalikbs/biomedical/qald4/index/"; fn1 = "/home/massimo/canalikbs/movie/processed/"; fn2 = "/home/massimo/canalikbs/movie/index/"; } long start = System.currentTimeMillis(); System.out.println("Started at " + new Date()); new BuildIndex(fn1, fn2).start(); System.out.println("Ended at " + new Date()); long time = System.currentTimeMillis() - start; long sec = time / 1000; System.out.println("The process took " + (sec / 60) + "'" + (sec % 60) + "." + (time % 1000) + "\""); } }
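To wrap up, here is a minimal sketch of a driver class. Both directory paths and the class name BuildIndexDemo are placeholders, and the query half assumes the Lucene 5.x line implied by the IndexWriterConfig(Analyzer) constructor used above, with the lucene-queryparser module also on the classpath. It builds the index, then opens the freshly written "lucene" directory and runs a label search with the same analyzer configuration the "label" field was indexed with.

import edu.ucla.cs.scai.canali.core.index.BuildIndex;
import java.nio.file.Paths;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;

public class BuildIndexDemo {

    public static void main(String[] args) throws Exception {
        String input = "/data/ontology-files/"; //placeholder: must contain the input files listed above
        String output = "/data/index/";         //placeholder: receives the "lucene" directory and the "elements" file

        //build the index
        new BuildIndex(input, output).start();

        //query it: the "label" field was indexed with an EnglishAnalyzer and an empty
        //stop-word set, so the same analyzer is used at search time
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(output + "lucene")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser("label", new EnglishAnalyzer(CharArraySet.EMPTY_SET));
            for (ScoreDoc hit : searcher.search(parser.parse("artist"), 10).scoreDocs) {
                System.out.println("token id: " + searcher.doc(hit.doc).get("id"));
            }
        }
    }
}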