ThesaurusTreatment.ThesaurusProcess.java Source code

Java tutorial

Introduction

Here is the source code for ThesaurusTreatment.ThesaurusProcess.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package ThesaurusTreatment;

import Disambiguator.WordNetDesambiguisator;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.didion.jwnl.JWNLException;
import net.sf.json.JSONObject;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import t2kb.SparqlProxy;

/**
 *
 * @author murloc
 */
public class ThesaurusProcess {
    private SparqlProxy spIn;
    private SparqlProxy spOut;

    HashMap<String, String> ranksAlign;

    String adomFileName;

    public ThesaurusProcess(String urlServerIn, String urlServerOut, String adomFileName) {
        ranksAlign = new HashMap<>();
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_330935",
                "http://ontology.irstea.fr/AgronomicTaxon#ClassRank");
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_330937",
                "http://ontology.irstea.fr/AgronomicTaxon#FamilyRank");
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_11125",
                "http://ontology.irstea.fr/AgronomicTaxon#GenusRank");
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_330934",
                "http://ontology.irstea.fr/AgronomicTaxon#KingdomRank");
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_330936",
                "http://ontology.irstea.fr/AgronomicTaxon#OrderRank");
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_330950",
                "http://ontology.irstea.fr/AgronomicTaxon#PhylumRank");
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_331243",
                "http://ontology.irstea.fr/AgronomicTaxon#SpecyRank");
        ranksAlign.put("http://aims.fao.org/aos/agrovoc/c_8157",
                "http://ontology.irstea.fr/AgronomicTaxon#VarietyRank");

        this.adomFileName = adomFileName;

        this.spIn = SparqlProxy.getSparqlProxy(urlServerIn);
        this.spOut = SparqlProxy.getSparqlProxy(urlServerOut);
        System.out.println("Clearing ouput KB...");
        this.spOut.storeData(new StringBuilder("DELETE WHERE {?a ?b ?c}")); // clean the output sparql endpoint before adding contents
    }

    public String getADOMTtl() {

        //        String ret = "prefix : <http://www.w3.org/2002/07/owl#> \n" +
        //"prefix owl: <http://www.w3.org/2002/07/owl#> \n" +
        //"prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> \n" +
        //"prefix xml: <http://www.w3.org/XML/1998/namespace> \n" +
        //"prefix xsd: <http://www.w3.org/2001/XMLSchema#> \n" +
        //"prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> ";

        String ret = "prefix : <http://www.w3.org/2002/07/owl#> \n"
                + "prefix owl: <http://www.w3.org/2002/07/owl#> \n"
                + "prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> \n"
                + "prefix xml: <http://www.w3.org/XML/1998/namespace> \n"
                + "prefix xsd: <http://www.w3.org/2001/XMLSchema#> \n"
                + "prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n"
                + "prefix skos: <http://www.w3.org/2004/02/skos/core#> \n"
                + "prefix swrl: <http://www.w3.org/2003/11/swrl#> \n"
                + "prefix swrlb: <http://www.w3.org/2003/11/swrlb#> \n"
                + "prefix terms: <http://purl.org/dc/terms/> \n"
                + "prefix AgronomicTaxon: <http://ontology.irstea.fr/AgronomicTaxon#> \n"
                + "base <http://ontology.irstea.fr/AgronomicTaxon> \n";

        ret += "INSERT DATA {";

        try {
            ret += IOUtils.toString(new FileInputStream(new File(this.adomFileName)));
            //ret = ret.replaceAll("^@.+\\.$", "");   // remove Prefix (wrong syntax for SPARQL insert query)
        } catch (IOException ex) {
            System.err.println("Can't read adom file!");
            System.exit(0);
        }

        ret += "}";
        return ret;
    }

    public boolean isAcceptLang(String lang) {
        return ((lang.compareToIgnoreCase("en") == 0) || (lang.compareToIgnoreCase("fr") == 0)
                || (lang.compareToIgnoreCase("de") == 0) || (lang.compareToIgnoreCase("es") == 0));
    }

    public void loadAllConcepts(String limitUri) {
        /*System.out.println("Initialize WordNet ...");
        WordNetDesambiguisator wnd = WordNetDesambiguisator.getWordNetConnector();
        System.out.println("WordNet initialized!");*/

        System.out.println("Exporting ADOM ...");
        if (!this.spOut.storeData(new StringBuilder(this.getADOMTtl()), false)) {
            System.out.println("ERROR during exporting ADOM");
            System.exit(0);
        }
        System.out.println("ADOM exported!");

        //ArrayList<JSONObject> listURI = this.spIn.sendQuery("SELECT * WHERE{?c rdf:type skos:Concept. ?c skos:broader+ <http://aims.fao.org/aos/agrovoc/c_5993>.}"); //toutes les plantes
        ArrayList<JSONObject> listURI = this.spIn
                .sendQuery("SELECT * WHERE{?c rdf:type skos:Concept. ?c skos:broader+ <" + limitUri + ">.}");
        listURI.addAll(this.spIn
                .sendQuery("SELECT * WHERE {?c rdf:type skos:Concept.  <" + limitUri + "> skos:broader+ ?c.}"));
        JSONObject joMain = new JSONObject();
        JSONObject cUri = new JSONObject();
        cUri.put("value", limitUri);
        joMain.put("c", cUri);
        System.out.println(joMain);
        listURI.add(joMain);
        System.out.println(listURI.size() + " concepts  import");
        System.out.println("Begin creating raw KB ...");
        int i = 0;
        int nbQuery = 0;
        int nbSubClassOf = 0;

        String baseUri = "http://ontology.irstea.fr/AGROVOCTaxon#";

        //String stringStats = " Uri sujet ; label franais ; relation ; WordNet; Uri objet ; label franais ; Validation Catherine; Wikipedia; NCBI; note";

        // def relations
        HashMap<String, OntologyRelation> objProp = new HashMap<>();

        StringBuilder updateQuery = new StringBuilder("INSERT DATA {");

        String classifUri = baseUri + "classification_AGROVOC";

        updateQuery.append("<" + classifUri
                + "> rdf:type <http://ontology.irstea.fr/AgronomicTaxon#Taxonomy>; rdfs:label \"AGROVOC agronomic classification\".");

        for (JSONObject s : listURI) {

            String currentQueryPart = "";
            String currentObjPropQuerypart = "";

            String uri = s.getJSONObject("c").getString("value");

            String rankUri = "http://ontology.irstea.fr/AgronomicTaxon#Taxon";
            ArrayList<JSONObject> elemRanks = this.spIn
                    .sendQuery("SELECT * WHERE {<" + uri + "> agrovoc:hasTaxonomicLevel ?rankUri.}");
            if (!elemRanks.isEmpty()) {
                String uriInRank = elemRanks.get(0).getJSONObject("rankUri").getString("value");
                rankUri = this.ranksAlign.get(uriInRank);
                if (rankUri == null) {
                    //rankUri = "http://ontology.irstea.fr/AgronomicTaxon#Taxon";
                    this.ranksAlign.put(uriInRank, uriInRank);
                    currentQueryPart += "<" + uriInRank
                            + "> rdf:type owl:Class; rdfs:subClassOf  <http://ontology.irstea.fr/AgronomicTaxon#Taxon>;  ";
                    ArrayList<JSONObject> labelsClass = this.spIn.sendQuery("SELECT * WHERE {<" + uriInRank
                            + ">  (<http://www.w3.org/2004/02/skos/core#prefLabel>|<http://www.w3.org/2004/02/skos/core#altLabel>) ?label}");
                    for (JSONObject sLabel : labelsClass) {
                        String lang = sLabel.getJSONObject("label").getString("xml:lang");
                        String label = SparqlProxy.cleanString(sLabel.getJSONObject("label").getString("value"));
                        //currentQueryPart += " rdfs:label ";
                        if (lang.isEmpty()) {
                            currentQueryPart += " rdfs:label  \" " + label + "\"; ";
                        } else if (this.isAcceptLang(lang)) {
                            currentQueryPart += " rdfs:label  \"" + label + "\"@" + lang + "; ";
                        }
                    }

                    currentQueryPart = currentQueryPart.substring(0, currentQueryPart.lastIndexOf(";"));
                    currentQueryPart += ".";
                    rankUri = uriInRank;
                }
            }

            currentQueryPart += "<" + uri + "> rdf:type <" + rankUri
                    + ">; <http://ontology.irstea.fr/AgronomicTaxon#inScheme> <" + classifUri + ">; ";

            ArrayList<JSONObject> listRel = this.spIn.sendQuery("SELECT * WHERE {<" + uri + "> ?rel ?val.}");
            for (JSONObject sRel : listRel) {
                String rel = sRel.getJSONObject("rel").getString("value");
                String val = sRel.getJSONObject("val").getString("value");
                if (SparqlProxy.isLabelRel(rel)) {
                    String lang = sRel.getJSONObject("val").getString("xml:lang");
                    String label = SparqlProxy.cleanString(val);
                    String adomRelUri = "http://ontology.irstea.fr/AgronomicTaxon#hasVernacularName";
                    if (rel.equalsIgnoreCase("http://www.w3.org/2004/02/skos/core#prefLabel")) {
                        adomRelUri = "http://ontology.irstea.fr/AgronomicTaxon#hasScientificName";
                    }

                    if (!label.isEmpty()) {
                        if (lang.isEmpty()) {
                            currentQueryPart += " <" + adomRelUri + "> \"" + SparqlProxy.cleanString(val) + "\";";
                            currentQueryPart += " rdfs:label  \"" + SparqlProxy.cleanString(val) + "\";";
                        } else if (this.isAcceptLang(lang)) {
                            currentQueryPart += " <" + adomRelUri + "> \"" + SparqlProxy.cleanString(val) + "\"@"
                                    + lang + ";";
                            currentQueryPart += " rdfs:label \"" + SparqlProxy.cleanString(val) + "\"@" + lang
                                    + ";";
                        }
                    }
                } else if (SparqlProxy.isSubRel(rel)) {
                    nbSubClassOf++;
                    currentQueryPart += " <http://ontology.irstea.fr/AgronomicTaxon#hasHigherRank>   <" + val
                            + ">; ";
                    /* REMOVE WordNet desambiguator
                    Boolean isSubClassOf = this.isSubClassOf(wnd, uri, val);
                    if(isSubClassOf == null || isSubClassOf)
                    {
                    currentQueryPart += " rdfs:subClassOf  <"+val+">; ";
                    if(isSubClassOf == null)
                    {
                        nbSubClassOfNotFoundWN++;
                        stringStats += "\n"+this.getConceptsStatsDescription(uri, val, "Not found");
                    }
                    else if(isSubClassOf)
                    {
                        nbSubClassOfValidated ++;
                        stringStats += "\n"+this.getConceptsStatsDescription(uri, val, "Validated");
                    }
                    }
                    else
                    {
                    nbSubClassOfNotValidated++;
                    stringStats += "\n"+this.getConceptsStatsDescription(uri, val, "Not validated");
                    }*/
                }
                /*else if(!(SparqlProxy.isExcludeRel(rel)) && val.startsWith("http://aims.fao.org/aos/agrovoc/"))
                {
                OntologyRelation or = objProp.get(rel);
                if(or == null)
                {
                    or = new OntologyRelation(rel, this.spOut);
                    objProp.put(rel, or);
                }
                String subProp = or.addRangeDomain(uri, val);
                currentObjPropQuerypart += subProp;
                }*/

            }
            currentQueryPart = currentQueryPart.substring(0, currentQueryPart.lastIndexOf(";"));
            currentQueryPart += ".";
            currentQueryPart += currentObjPropQuerypart;

            int fullLength = updateQuery.length() + currentQueryPart.length();
            if (fullLength > 4000000) // limit for the fuseki update query length
            {
                nbQuery++;
                updateQuery.append("}");
                boolean ret = this.spOut.storeData(updateQuery);
                System.out.println(i + " concepts treated (query n " + nbQuery + ")...");
                if (!ret) //if store query bugged
                    System.exit(0);
                updateQuery = new StringBuilder("INSERT DATA {" + currentQueryPart);
            } else {
                updateQuery.append(currentQueryPart);
            }
            i++;
        }

        if (!updateQuery.equals("INSERT DATA {")) {
            nbQuery++;
            updateQuery.append("}");
            this.spOut.storeData(updateQuery);
            System.out.println(i + " concepts treated (query n " + nbQuery + ")...");
        }

        System.out.println("Begin object properties definitions (" + objProp.values().size() + " objProps) ...");
        int nbObjProp = 0;
        nbQuery = 0;
        int nbObjPropCached = 0;
        updateQuery = new StringBuilder("INSERT DATA {");

        for (OntologyRelation or : objProp.values()) {
            StringBuilder currentQueryPart = new StringBuilder("");
            nbObjProp++;
            currentQueryPart.append(or.toTtl(this.spOut));
            //currentQueryPart.append(or.getSubPropertiesTtl());

            int fullLength = updateQuery.length() + currentQueryPart.length();
            if (fullLength > 4000000) // limit for the fuseki limit POST request
            {
                nbQuery++;
                updateQuery.append("}");
                boolean ret = this.spOut.storeData(updateQuery);
                System.out.println(nbObjProp + " properties treated (query n " + nbQuery + ")...");
                if (!ret) //if store query bugged
                    System.exit(0);
                updateQuery = new StringBuilder("INSERT DATA {" + currentQueryPart);
                nbObjPropCached = 0;
            } else {
                updateQuery.append(currentQueryPart);
            }
            nbObjPropCached++;
            System.out.println(nbObjPropCached + " Object properties definitions cached");
        }
        if (!updateQuery.equals("INSERT DATA {")) {
            nbQuery++;
            updateQuery.append("}");
            this.spOut.storeData(updateQuery);
            System.out.println(nbObjProp + " properties treated (query n " + nbQuery + ")...");
        }

        System.out.println("Raw KB (" + i + " concepts || " + nbObjProp + " objProps) created!");

        /*  System.out.println("------------------ \n");
          String s = "Nb subClassOf discovered :  "+nbSubClassOf+" \n";
          s += "Nb subClassOf validated by WN : "+nbSubClassOfValidated+"\n";
          s += "Nb subClassOf not found in WN : "+nbSubClassOfNotFoundWN+"\n";
          s += "Nb subClassOf not validated by WN : "+nbSubClassOfNotValidated+"\n";
          System.out.println(s);
          File fStatsOut = new File("out/Stats_out.txt");
          File fileStat = new File("out/stats_WordNet.csv");
          try 
          {
        FileUtils.writeStringToFile(fStatsOut, s, "UTF8", false);
        FileUtils.writeStringToFile(fileStat, stringStats);
          } 
          catch (IOException ex) 
          {
        System.out.println("EROR during output stats out file ");
        System.exit(0);
          }*/

    }

    /* SPARQL query save (to facilitate copy/paste)
    SELECT * WHERE {?a ?b ?c} LIMIT 10
    DELETE {?a ?b ?c} WHERE {?a ?b ?c}
        
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX owl:    <http://www.w3.org/2002/07/owl#>
    SELECT COUNT(?c) WHERE{?c rdf:type owl:Class.}
        
        
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX owl:    <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    ASK {<http://aims.fao.org/aos/agrovoc/c_7955> rdfs:subClassOf* <http://aims.fao.org/aos/agrovoc/c_3354>.}
        
    */

    public Boolean isSubClassOf(WordNetDesambiguisator wnd, String uri1, String uri2) {
        Boolean ret = null;
        ArrayList<String> labelsUri1 = new ArrayList<>();
        ArrayList<String> labelsUri2 = new ArrayList<>();

        ArrayList<JSONObject> retLabels = this.spIn.sendQuery("SELECT * WHERE {<" + uri1
                + "> skos:prefLabel ?label .   FILTER( LANGMATCHES(LANG(?label), \"en\")). }");
        for (JSONObject jsono : retLabels) {
            labelsUri1.add(jsono.getJSONObject("label").getString("value"));
        }

        retLabels = this.spIn.sendQuery("SELECT * WHERE {<" + uri2
                + "> skos:prefLabel ?label .   FILTER( LANGMATCHES(LANG(?label), \"en\")). }");
        for (JSONObject jsono : retLabels) {
            labelsUri2.add(jsono.getJSONObject("label").getString("value"));
        }
        //System.out.println("INIT CARTESIAN : "+labelsUri1.size()+" X "+labelsUri2.size());
        int i = 0, j = 0;
        //Cartesian product to test all english terms couple
        while ((ret == null || !ret) && i < labelsUri1.size()) {
            String term1 = labelsUri1.get(i);
            if (wnd.isInWordNet(term1)) {
                while ((ret == null || !ret) && j < labelsUri2.size()) {
                    String term2 = labelsUri2.get(j);
                    if (wnd.isInWordNet(term2)) {
                        ret = wnd.getRelation(term1, term2).equalsIgnoreCase("subClassOf");
                        //System.out.println("relation between : "+term1+" ---"+ret+"---> "+term2);
                    } else {
                        //System.out.println("Term not found in WN (2): "+term2);
                    }
                    j++;
                }
            } else {
                //System.out.println("Term not found in WN : "+term1);
            }
            i++;
        }

        //System.out.println("Return : "+ret);
        return ret;
    }

    public void exportKBToFile(String fileName) {
        this.spOut.writeKBFile(fileName);
    }

    private ArrayList<String> getAllLabel(String uri, String lang) {
        ArrayList<String> ret = new ArrayList<>();

        String query = "SELECT * WHERE { <" + uri + "> (skos:prefLabel | skos:altLabel) ?label.";
        if (lang != null) {
            query += "FILTER ( lang(?label) = \"fr\" )";
        }
        query += "}";
        ArrayList<JSONObject> labels = this.spIn.sendQuery(query);
        for (JSONObject jsono : labels) {
            ret.add(jsono.getJSONObject("label").getString("value"));
        }
        return ret;
    }

    private ArrayList<String> getAllLabelsLang(String uri) {
        ArrayList<String> sLabels = this.getAllLabel(uri, "fr");
        if (sLabels.isEmpty()) {
            sLabels = this.getAllLabel(uri, "en");
        }
        if (sLabels.isEmpty()) {
            sLabels = this.getAllLabel(uri, null);
        }
        return sLabels;
    }

    private String getConceptsStatsDescription(String uriS, String uriO, String status) {
        String ret = "";

        String subjectLabels = "";
        for (String s : this.getAllLabelsLang(uriS)) {
            subjectLabels += s + ", ";
        }

        String objectLabels = "";
        for (String s : this.getAllLabelsLang(uriO)) {
            objectLabels += s + ", ";
        }

        ret = uriS + " ; " + subjectLabels + "; subClassOf ; " + status + "; " + uriO + "; " + objectLabels
                + "; ; ; ; ";

        return ret;
    }

}