kmi.taa.core.Crawler.java Source code

Java tutorial

Introduction

Here is the source code for kmi.taa.core.Crawler.java

Source

/*
 * (C) Copyright 2017 Shuangyan Liu
 * Shuangyan.Liu@open.ac.uk 
 * Knowledge Media Institute
 * The Open University, United Kingdom
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package kmi.taa.core;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.client.ClientProtocolException;
import org.apache.log4j.Logger;
import org.nnsoft.sameas4j.DefaultSameAsServiceFactory;
import org.nnsoft.sameas4j.Equivalence;
import org.nnsoft.sameas4j.SameAsService;
import org.nnsoft.sameas4j.SameAsServiceException;

public class Crawler {
    final static Logger log = Logger.getLogger(Crawler.class.getName());

    /**
     * Queries a SPARQL endpoint for the owl:sameAs objects of a resource.
     *
     * @param res     URI of the subject resource
     * @param service SPARQL endpoint URL (CSV output is requested)
     * @param proxy   proxy passed through to the HTTP client
     * @return the sameAs URIs found, with surrounding double quotes stripped;
     *         empty array when none matched
     * @throws IOException if the HTTP request fails
     */
    public static String[] crawlsaspty(String res, String service, String proxy)
            throws ClientProtocolException, IOException {
        String nl = System.getProperty("line.separator");
        /*
         *  The String query is the SPARQL query used to get the owl:sameAs subject links
         *  from the knowledge graph where the source triples are from.
         */
        String query = "prefix owl: <http://www.w3.org/2002/07/owl#>" + nl
                + "select ?obj" + nl + "where {"
                + nl + "<" + res + "> owl:sameAs ?obj . "
                + nl + "}";

        // converting a String to the application/x-www-form-urlencoded MIME format
        String get = service + "?query=" + URLEncoder.encode(query, "utf-8") + "&output=csv";

        SPARQLHTTPClient client = new SPARQLHTTPClient();
        String slinks = client.httpGet(get, proxy);

        /*
         * Split on any line ending (\n or \r\n): the server's line separator need
         * not match the local platform's, so System.getProperty("line.separator")
         * would fail on Windows against a Unix-style response.
         *
         * Keep only quoted http URIs (this also drops the CSV header line "obj")
         * and strip the wrapping double quotes. Collecting into a list avoids the
         * trailing null slots the original fixed-size array left behind when some
         * lines did not match.
         */
        String[] lines = slinks.split("\\r?\\n");
        List<String> formated = new ArrayList<>();
        for (String line : lines) {
            if (line.matches("\"http\\S+\"")) {
                formated.add(line.substring(1, line.length() - 1));
            }
        }

        return formated.toArray(new String[0]);

    }

    /**
     * Looks up equivalent URIs for {@code res} via the sameas.org service.
     *
     * @param res resource URI to look up
     * @return the Equivalence bundle, or null when the service call failed
     * @throws URISyntaxException if {@code res} is not a valid URI
     */
    public Equivalence sameAsService(String res) throws URISyntaxException {
        SameAsService sameAsService = DefaultSameAsServiceFactory.createNew();
        Equivalence equivalence = null;
        try {
            equivalence = sameAsService.getDuplicates(new URI(res));
        } catch (SameAsServiceException e) {
            log.error("SameAsService error", e);
        }

        return equivalence;

    }

    /**
     * Read source triples into a TreeMap
     * TreeMap is RBTree based, the map is sorted according to the natural
     * ordering of the keys, time complexity is O(lg(n)).
     * However, HashMap does not keep the ordering of keys in addition order,
     * we want to keep the order of the source triples as it is in the triples file.
     * Key should be type of Integer as String has a different sort method
     *
     * @param file path of a tab-separated file: integer id, then a resource URI
     * @return id → URI map, sorted by id; empty on read failure
     */
    public TreeMap<Integer, String> readSourceTriples(String file) {
        TreeMap<Integer, String> map = new TreeMap<>();
        String cvsSplitBy = "\t";

        // try-with-resources closes the reader on every path (the original used
        // a manual finally block); errors go to the class logger, not stderr.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] resource = line.split(cvsSplitBy);
                // Integer.valueOf instead of the deprecated new Integer(String)
                map.put(Integer.valueOf(resource[0]), resource[1]);
            }
        } catch (FileNotFoundException e) {
            log.error("source triples file not found: " + file, e);
        } catch (IOException e) {
            log.error("error reading source triples file: " + file, e);
        }

        return map;

    }

    /**
     * Crawls sameAs subject links for every triple in {@code triplesFile} and
     * writes the results to {@code dirOutput}/subjectlinks.txt.
     *
     * @param triplesFile tab-separated input file of id/URI pairs
     * @param dirOutput   output directory prefix (trailing separator expected)
     * @param service     SPARQL endpoint URL
     * @param proxy       HTTP proxy, forwarded to the workers
     */
    public void run(String triplesFile, String dirOutput, String service, String proxy)
            throws ClientProtocolException, IOException, URISyntaxException {
        TreeMap<Integer, String> map = readSourceTriples(triplesFile);
        log.info("Crawling subject links...");
        String file = dirOutput + "subjectlinks.txt";
        // start from a clean output file
        Files.deleteIfExists(Paths.get(file));
        crawlAll(map, service, proxy, file);
        log.info("Crawling subject links completed. ");

    }

    /**
     * Fans the target URLs out over a fixed thread pool, polls until every
     * worker has reported completion, then dumps the collected links to file.
     *
     * @param targetUrls id → URI map of resources to crawl
     * @param service    SPARQL endpoint URL
     * @param proxy      HTTP proxy
     * @param otfile     output file path
     */
    public void crawlAll(TreeMap<Integer, String> targetUrls, String service, String proxy, String otfile) {
        SortedSet<Integer> results = Collections.synchronizedSortedSet(new TreeSet<Integer>());
        ExecutorService pool = Executors.newFixedThreadPool(100);

        int howManyUrls = targetUrls.size();
        System.out.println("total " + howManyUrls + " to be processed");

        List<String> output = Collections.synchronizedList(new ArrayList<String>());
        for (Integer targetId : targetUrls.navigableKeySet()) {
            String uri = targetUrls.get(targetId);
            pool.execute(new Explorer(targetId, uri, service, proxy, results, otfile, output));
        }
        pool.shutdown();

        // Poll until every Explorer has registered its id in `results`.
        while (results.size() < howManyUrls) {
            System.out.println("already processed " + results.size() + " subject links");
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                log.error("crawlAll error", e);
                // Restore the interrupt flag and stop waiting; continuing would
                // busy-spin because sleep() rethrows immediately once interrupted.
                Thread.currentThread().interrupt();
                break;
            }

        }

        resultToFile(output, otfile);
        System.out.println("already processed " + results.size() + " subject links");

    }

    /**
     * Writes each collected link on its own line, overwriting any existing file.
     * The iteration over the synchronized list is locked per the
     * Collections.synchronizedList contract.
     *
     * @param result lines to write (a synchronized list shared with workers)
     * @param fname  destination file path
     */
    public void resultToFile(List<String> result, String fname) {
        // getParentFile() is null for a bare file name; only mkdirs() when
        // there actually is a parent directory to create.
        File parent = new File(fname).getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        // try-with-resources: the original leaked the writer if write() threw.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(fname, false))) {
            synchronized (result) {
                for (String str : result) {
                    bw.write(str);
                    bw.write(System.lineSeparator());
                }
            }
        } catch (IOException e) {
            log.error("write fetched subject links to file error", e);
        }

    }
}

/**
 * Worker task that crawls the sameAs links of a single resource and appends
 * its findings to a shared output list, then marks its id as processed.
 */
class Explorer implements Runnable {
    final static Logger log = Logger.getLogger(Explorer.class.getName());

    Integer targetId;              // id of the source triple being processed
    String uri;                    // resource URI to crawl
    String service;                // SPARQL endpoint URL
    String proxy;                  // HTTP proxy for the SPARQL client
    SortedSet<Integer> results;    // shared set of completed ids
    String fname;                  // output file path (kept for reference)
    List<String> output;           // shared, synchronized list of result lines

    public Explorer(Integer targetId, String uri, String service, String proxy, SortedSet<Integer> results,
            String fname, List<String> output) {
        this.targetId = targetId;
        this.uri = uri;
        this.service = service;
        this.proxy = proxy;
        this.results = results;
        this.fname = fname;
        this.output = output;
    }

    @Override
    public void run() {
        List<String> found = new ArrayList<String>();

        // First source: the SPARQL endpoint, when the service URL looks like one.
        if (service.contains("sparql")) {
            try {
                for (String link : Crawler.crawlsaspty(uri, service, proxy)) {
                    found.add(targetId + "\t" + uri + "\t" + link + "\t" + "owl:sameAs");
                }
            } catch (IOException e) {
                log.error("crawling via sameAs property error", e);
            }
        }

        // Second source: the sameas.org lookup service.
        try {
            Equivalence equivalence = new Crawler().sameAsService(uri);
            if (equivalence != null) {
                for (URI duplicate : equivalence) {
                    found.add(targetId + "\t" + uri + "\t" + duplicate + "\t" + "sameas.org service");
                }
            }
        } catch (URISyntaxException e) {
            log.error("crawling via sameAs service error", e);
        }

        // Publish everything we found, then flag this target as done.
        output.addAll(found);
        results.add(targetId);

    }

}