KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages.java Source code

Introduction

Here is the source code for KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages.java.
Source

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;

import com.knowledgebooks.nlp.AutoTagger;
import com.knowledgebooks.nlp.util.NameValue;
import com.knowledgebooks.nlp.ExtractNames;
import com.knowledgebooks.nlp.util.ScoredList;
import com.knowledgebooks.info_spiders.WebSpider;
import org.apache.commons.io.FileUtils;

/**
 * Copyright Mark Watson 2008-2010. All Rights Reserved.
 * License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
 */

/**
 * Spiders the web sites listed in a configuration file and writes N-Triples
 * statements describing every fetched page to a {@link PrintWriter}.
 *
 * <p>For each page the generator emits: an {@code rdf:type} statement, the page
 * text ({@code contents}), one {@code containsPerson} / {@code containsPlace}
 * statement per string in the two {@link ScoredList}s returned by
 * {@link ExtractNames#getProperNames}, and one statement per tag returned by
 * {@link AutoTagger#getTags}. Once every site has been spidered, each ordered
 * pair of distinct pages is scored by the number of tag names they share and a
 * high/medium/low similarity statement is written when the score is large enough.
 *
 * <p>All work is done by the constructor, which always closes the supplied writer.
 */
public class KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages {

    /** Namespace prefix shared by every knowledgebooks.com property emitted here. */
    private static final String KB_RDF_NS = "http://knowledgebooks.com/rdf/";

    // Similarity thresholds, checked from highest to lowest. The three values
    // must be distinct and descending, otherwise a lower branch is unreachable.
    private static final float HIGH_SIMILARITY_THRESHOLD = 12f;
    private static final float MEDIUM_SIMILARITY_THRESHOLD = 8f;
    private static final float LOW_SIMILARITY_THRESHOLD = 5f;

    private final PrintWriter out;
    private final ExtractNames extractNames;
    private final AutoTagger autoTagger;

    /**
     * Page URL -> names of the tags found on that page. Insertion-ordered so the
     * similarity statements are written in a reproducible order.
     */
    private final Map<String, Set<String>> inter_webpage_shared_tags = new LinkedHashMap<String, Set<String>>();

    /**
     * Reads the configuration file, spiders every site listed in it, writes the
     * resulting statements to {@code out} and closes {@code out}.
     *
     * <p>Each non-blank configuration line has the form
     * {@code <starting-url> <spider-depth>}, separated by whitespace. A line that
     * is malformed, or whose site fails to spider, is reported on
     * {@code System.err} and skipped so the remaining sites are still processed.
     *
     * @param config_file_path path of the configuration file
     * @param out              destination for the generated N-Triples; closed before
     *                         this constructor returns, whether or not it succeeds
     * @throws IOException if the configuration file cannot be read
     */
    public KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages(String config_file_path, PrintWriter out)
            throws IOException {
        this.out = out;
        try {
            extractNames = new ExtractNames();
            autoTagger = new AutoTagger();
            // Older commons-io versions declare a raw List here, hence the cast.
            @SuppressWarnings("unchecked")
            List<String> lines = (List<String>) FileUtils.readLines(new File(config_file_path));
            for (String line : lines) {
                String trimmed = line.trim();
                if (trimmed.length() == 0) {
                    continue; // tolerate blank lines in the configuration file
                }
                String[] fields = trimmed.split("\\s+");
                if (fields.length < 2) {
                    System.err.println("Skipping malformed config line (expected \"<url> <depth>\"): " + line);
                    continue;
                }
                try {
                    spider(fields[0], Integer.parseInt(fields[1]));
                } catch (Exception ex) {
                    // Best effort: one bad site must not abort the whole run.
                    System.err.println("Failed to process config line: " + line);
                    ex.printStackTrace();
                }
            }
            // Run once, after all sites are known, so that each pair of pages is
            // compared exactly once per direction instead of once per config line.
            process_interpage_shared_properties();
        } finally {
            out.close();
        }
    }

    /**
     * Spiders one site, writes the per-page statements and records each page's
     * tag names for the later similarity pass.
     *
     * @param starting_url URL handed to {@link WebSpider}
     * @param spider_depth depth handed to {@link WebSpider}
     * @throws Exception if spidering or text analysis fails
     */
    private void spider(String starting_url, int spider_depth) throws Exception {
        System.out.println("** spider(" + starting_url + ", " + spider_depth + ")");
        WebSpider ws = new WebSpider(starting_url, spider_depth);
        for (List<String> ls : ws.url_content_lists) {
            // Element 0 is used as the page URL, element 1 as the page text.
            String url = ls.get(0);
            String text = ls.get(1);
            System.out.println("\n\n\n----URL:\n" + url + "\n    content:\n" + text);

            ScoredList[] names = extractNames.getProperNames(text);
            ScoredList people = names[0];
            ScoredList places = names[1];
            List<NameValue<String, Float>> tags = autoTagger.getTags(text);

            out.println("<" + url
                    + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <" + KB_RDF_NS + "webpage> .");
            printLiteralTriple(url, "contents", text.trim());
            for (String person : people.getStrings()) {
                printLiteralTriple(url, "containsPerson", person);
            }
            for (String place : places.getStrings()) {
                printLiteralTriple(url, "containsPlace", place);
            }

            Set<String> tagNames = new HashSet<String>();
            for (NameValue<String, Float> nv : tags) {
                String tagName = "" + nv.getName();
                printLiteralTriple(url, tagName, "" + nv.getValue());
                tagNames.add(tagName);
            }
            inter_webpage_shared_tags.put(url, tagNames);
        }
    }

    /**
     * Writes a similarity statement for every ordered pair of distinct pages
     * whose shared-tag count exceeds one of the similarity thresholds. Pairs at
     * or below the lowest threshold produce no statement.
     */
    private void process_interpage_shared_properties() {
        Set<String> unique_urls = inter_webpage_shared_tags.keySet();
        for (String url_1 : unique_urls) {
            for (String url_2 : unique_urls) {
                if (url_1.equals(url_2)) {
                    continue;
                }
                System.out.println("\n\n^^^^^^^^^ " + url_1 + " : " + url_2 + "\n");
                float url_similarity = score_mapset(inter_webpage_shared_tags.get(url_1),
                        inter_webpage_shared_tags.get(url_2));
                String level;
                if (url_similarity > HIGH_SIMILARITY_THRESHOLD) {
                    level = "high_similarity";
                } else if (url_similarity > MEDIUM_SIMILARITY_THRESHOLD) {
                    level = "medium_similarity";
                } else if (url_similarity > LOW_SIMILARITY_THRESHOLD) {
                    level = "low_similarity";
                } else {
                    continue;
                }
                out.println("<" + url_1 + "> <" + KB_RDF_NS + level + "> <" + url_2 + "> .");
            }
        }
    }

    /**
     * Returns the number of elements the two sets have in common.
     *
     * <p>The intersection is computed on a copy: the arguments are the sets
     * stored in {@link #inter_webpage_shared_tags}, and shrinking them in place
     * would corrupt every later comparison involving the same page.
     *
     * @param set_1 first set; not modified
     * @param set_2 second set; not modified
     * @return size of the intersection
     */
    private static float score_mapset(Set<String> set_1, Set<String> set_2) {
        Set<String> shared = new HashSet<String>(set_1);
        shared.retainAll(set_2);
        return shared.size();
    }

    /**
     * Writes {@code <url> <KB_RDF_NS + property> "value" .} with the value
     * escaped for use as an N-Triples string literal.
     */
    private void printLiteralTriple(String url, String property, String value) {
        out.println("<" + url + "> <" + KB_RDF_NS + property + "> \"" + escapeLiteral(value) + "\" .");
    }

    /**
     * Makes a string safe to place between the double quotes of an N-Triples
     * literal. Double quotes are replaced by apostrophes (the substitution this
     * class has always applied); backslash, line feed, carriage return and tab
     * are written as their N-Triples escape sequences so that a literal can
     * never span several lines or contain a stray escape character.
     */
    private static String escapeLiteral(String value) {
        StringBuilder sb = new StringBuilder(value.length() + 16);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '"':
                    sb.append('\'');
                    break;
                case '\\':
                    sb.append("\\\\");
                    break;
                case '\n':
                    sb.append("\\n");
                    break;
                case '\r':
                    sb.append("\\r");
                    break;
                case '\t':
                    sb.append("\\t");
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        return sb.toString();
    }

    /**
     * Runs the generator on {@code testdata/websites.txt}, writing the result to
     * {@code tempdata/gen_rdf.nt}.
     */
    public static void main(String[] args) throws Exception {
        // N-Triples documents are UTF-8; do not depend on the platform default charset.
        new KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages("testdata/websites.txt",
                new PrintWriter("tempdata/gen_rdf.nt", "UTF-8"));
    }
}