de.mpg.escidoc.services.cone.util.CCCrawler.java Source code

Java tutorial

Introduction

Here is the source code for de.mpg.escidoc.services.cone.util.CCCrawler.java

Source

/*
*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at license/ESCIDOC.LICENSE
* or http://www.escidoc.org/license.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at license/ESCIDOC.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright 2006-2012 Fachinformationszentrum Karlsruhe Gesellschaft
* für wissenschaftlich-technische Information mbH and Max-Planck-
* Gesellschaft zur Förderung der Wissenschaft e.V.
* All rights reserved. Use is subject to license terms.
*/

package de.mpg.escidoc.services.cone.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;

import de.mpg.escidoc.services.cone.Querier;
import de.mpg.escidoc.services.cone.QuerierFactory;
import de.mpg.escidoc.services.framework.ProxyHelper;

/**
 * Command-line crawler for Creative Commons licences.
 *
 * <p>For every combination of the "commercial" and "derivatives" answers and every
 * {@link Jurisdiction}, {@link #main(String[])} requests the Creative Commons licence chooser
 * result page, extracts the licence URL, version and image from the HTML snippet the page offers
 * for copying, fetches the licence page itself for its title and its translation links, and
 * stores everything as a {@link TreeFragment} in the {@code cclicences} model through a
 * {@link Querier}.</p>
 *
 * @author franke (initial creation)
 * @author $Author$ (last modification)
 * @version $Revision$ $LastChangedDate$
 *
 */
public class CCCrawler {

    /**
     * Fixed part of the licence chooser request. Per request the parameters
     * {@code field_commercial}, {@code field_derivatives}, {@code field_jurisdiction} and
     * {@code lang} are appended.
     */
    private static final String CC_URL = "http://creativecommons.org/license/results-one?"
            + "q_1=2&q_1=1&field_format=&field_worktitle=&field_attribute_to_name=&"
            + "field_attribute_to_url=&field_sourceurl=&field_morepermissionsurl=&n_questions=3";

    /** Start of the textarea element that holds the HTML snippet to copy. */
    private static final String CODE_START_MARKER = "<textarea id=\"codetocopy\"";

    /** End tag of the textarea element that holds the HTML snippet to copy. */
    private static final String CODE_END_MARKER = "</textarea>";

    /** First {@code href} attribute in the snippet: the licence URL. */
    private static final Pattern URL_PATTERN = Pattern.compile("href=\"([^\"]+)\"");

    /** Version number taken from the second-to-last path segment of the licence URL. */
    private static final Pattern VERSION_PATTERN = Pattern.compile("/(\\d+\\.\\d+)/[^/]+/$");

    /** First {@code src} attribute in the snippet: the licence image. */
    private static final Pattern IMAGE_PATTERN = Pattern.compile("src=\"([^\"]+)\"");

    /** Title heading of a licence page. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<h2 property=\"dc:title\">([^<]+)</h2>");

    /**
     * Link to a translated licence page. Groups: 1 = relative path, 2 = title attribute,
     * 3 = hreflang, 5 = lang.
     * NOTE(review): the '.' in front of the first '/' is not escaped and therefore matches any
     * character; presumably a literal "./" prefix was intended - confirm before tightening.
     */
    private static final Pattern LANGUAGE_LINK_PATTERN = Pattern.compile(
            "<a\\s+href=\"./([^\"]+)\"\\s+title=\"([^\"]+)\"\\s+hreflang=\"([^\"]+)\"\\s+rel=\"alternate nofollow\"\\s+(xml:)?lang=\"([^\"]+)\">");

    /**
     * Answer to a licence chooser question. The constant name is sent verbatim as the value of
     * the {@code field_commercial} / {@code field_derivatives} request parameter.
     */
    public enum YesNo {
        n, yes;

        /**
         * @return {@code "true"} for {@link #yes}, {@code "false"} otherwise
         */
        public String toBoolean() {
            return String.valueOf(this == YesNo.yes);
        }
    }

    /** Locale codes. Not referenced by the crawler code in this class. */
    public enum Language {
        af, bg, ca, cs, da, de, en, en_CA, en_GB, en_HK, en_SG, en_US, eo, es, es_AR, es_CL, es_CO, es_EC, es_GT, es_MX, es_PE, eu, fi, fr, fr_CA, gl, he, hr, hu, it, ja, ko, mk, ms, nl, no, nso, pl, pt, ro, sl, sr, sr_LATN, st, sv, th, zh, zh_HK, zh_TW, zu
    }

    /** Jurisdiction codes. The constant name is sent verbatim as {@code field_jurisdiction}. */
    public enum Jurisdiction {
        ar, at, au, be, bg, br, ca, ch, cl, cn, co, cz, de, dk, ec, es, fi, fr, gr, gt, hk, hr, hu, il, in, it, jp, kr, lu, mk, mt, mx, my, nl, no, nz, pe, ph, pl, pr, pt, ro, rs, scotland, se, sg, si, th, tw, uk, us, za
    }

    /** Querier used to store the licences and to generate identifiers; set up by {@link #main}. */
    private static Querier querier = null;

    /**
     * Crawls all commercial/derivatives/jurisdiction combinations and stores the licences found.
     *
     * @param args ignored
     * @throws Exception if a request or a datastore operation fails
     */
    public static void main(String[] args) throws Exception {
        HttpClient httpClient = new HttpClient();

        querier = QuerierFactory.newQuerier(false);
        try {
            // field_commercial
            for (YesNo fieldCommercial : YesNo.values()) {
                // field_derivatives
                for (YesNo fieldDerivatives : YesNo.values()) {
                    // field_jurisdiction
                    for (Jurisdiction fieldJurisdiction : Jurisdiction.values()) {
                        crawlLicence(httpClient, fieldCommercial, fieldDerivatives, fieldJurisdiction);
                    }
                }
            }
        } finally {
            // Release the querier even when the crawl is aborted by an exception.
            querier.release();
        }
    }

    /**
     * Requests the licence chooser for one combination of answers and, if a licence URL can be
     * extracted from the response, replaces the {@code cclicences} entry for that URL.
     *
     * @param httpClient client used for the chooser request and the licence page request
     * @param fieldCommercial value of the {@code field_commercial} parameter
     * @param fieldDerivatives value of the {@code field_derivatives} parameter
     * @param fieldJurisdiction value of the {@code field_jurisdiction} parameter
     * @throws Exception if a request or a datastore operation fails
     */
    private static void crawlLicence(HttpClient httpClient, YesNo fieldCommercial, YesNo fieldDerivatives,
            Jurisdiction fieldJurisdiction) throws Exception {
        String licenceUrl = CC_URL + "&field_commercial=" + fieldCommercial.toString()
                + "&field_derivatives=" + fieldDerivatives.toString() + "&field_jurisdiction="
                + fieldJurisdiction.toString() + "&lang=de_DE";
        System.out.println(licenceUrl);

        String codeToCopy;
        GetMethod method = new GetMethod(licenceUrl);
        try {
            ProxyHelper.executeMethod(httpClient, method);
            if (method.getStatusCode() != 200) {
                System.out.println("Not found: " + licenceUrl);
                return;
            }
            codeToCopy = extractCode(method);
        } finally {
            // The connection has to be handed back explicitly, otherwise it leaks.
            method.releaseConnection();
        }

        if (codeToCopy == null) {
            // extractCode has already reported the missing snippet; skip this combination.
            return;
        }

        Matcher urlMatcher = URL_PATTERN.matcher(codeToCopy);
        if (!urlMatcher.find()) {
            System.out.println("No licence URL found in code snippet: " + licenceUrl);
            return;
        }
        String url = urlMatcher.group(1);

        TreeFragment fragment = new TreeFragment();
        fragment.put("urn:cone:commercial", singleValue(fieldCommercial.toBoolean()));
        fragment.put("urn:cone:derivatives", singleValue(fieldDerivatives.toBoolean()));
        fragment.put("urn:cone:jurisdiction", singleValue(fieldJurisdiction.toString()));
        fragment.setSubject(url);

        Matcher versionMatcher = VERSION_PATTERN.matcher(url);
        if (versionMatcher.find()) {
            fragment.put("urn:cone:version", singleValue(versionMatcher.group(1)));
        }

        Matcher imgMatcher = IMAGE_PATTERN.matcher(codeToCopy);
        if (imgMatcher.find()) {
            fragment.put("http://xmlns.com/foaf/0.1/depiction", singleValue(imgMatcher.group(1)));
        }

        String page = fetchPage(httpClient, url);

        Matcher nameMatcher = TITLE_PATTERN.matcher(page);
        if (nameMatcher.find()) {
            fragment.put("http://purl.org/dc/elements/1.1/title", singleValue(nameMatcher.group(1)));
        }

        List<LocalizedTripleObject> languages = extractLanguages(page, url);
        fragment.put("urn:cone:translation", languages);

        // Delete first, then create, so that a re-run replaces an existing entry.
        querier.delete("cclicences", url);
        querier.create("cclicences", url, fragment);
    }

    /**
     * Builds one fragment per translation link found on a licence page. Each translated page is
     * fetched to read its title.
     *
     * @param page HTML of the licence page
     * @param baseURL URL of the licence page; the relative link paths are appended to it
     * @return the translation fragments in the order the links appear, possibly empty
     * @throws Exception if a request or the identifier generation fails
     */
    private static List<LocalizedTripleObject> extractLanguages(String page, String baseURL) throws Exception {
        HttpClient httpClient = new HttpClient();
        List<LocalizedTripleObject> result = new ArrayList<LocalizedTripleObject>();

        Matcher matcher = LANGUAGE_LINK_PATTERN.matcher(page);
        while (matcher.find()) {
            String genid = querier.createUniqueIdentifier(null);
            TreeFragment treeFragment = new TreeFragment(genid);

            String locale = matcher.group(3);

            // Only the part before the first underscore is used as the fragment's language.
            treeFragment.setLanguage(locale.split("_")[0]);

            String url = baseURL + matcher.group(1);
            treeFragment.put("http://purl.org/dc/elements/1.1/identifier", singleValue(url));

            // Title attribute of the link; the title of the translated page, when found,
            // is stored under the same key further down.
            treeFragment.put("http://purl.org/dc/elements/1.1/title", singleValue(matcher.group(2)));

            treeFragment.put("urn:cone:locale", singleValue(locale));

            String translation = fetchPage(httpClient, url);

            Matcher nameMatcher = TITLE_PATTERN.matcher(translation);
            if (nameMatcher.find()) {
                treeFragment.put("http://purl.org/dc/elements/1.1/title", singleValue(nameMatcher.group(1)));
            }

            result.add(treeFragment);
        }
        return result;
    }

    /**
     * Extracts the decoded content of the {@code codetocopy} textarea from a response.
     *
     * @param method an executed request for the licence chooser result page
     * @return the decoded snippet, or {@code null} if the response has no body or no complete
     *         {@code codetocopy} textarea (a message is printed in that case)
     * @throws IOException if the response body cannot be read
     */
    private static String extractCode(GetMethod method) throws IOException {
        String page = method.getResponseBodyAsString();
        int markerIndex = (page == null) ? -1 : page.indexOf(CODE_START_MARKER);
        if (markerIndex >= 0) {
            // Skip any further attributes up to the end of the opening tag.
            int tagEnd = page.indexOf(">", markerIndex + CODE_START_MARKER.length());
            int end = (tagEnd < 0) ? -1 : page.indexOf(CODE_END_MARKER, tagEnd + 1);
            if (end >= 0) {
                return decode(page.substring(tagEnd + 1, end));
            }
        }
        System.out.println("codetocopy not found: " + method.getPath());
        return null;
    }

    /**
     * Requests a URL and returns the response body.
     *
     * @param httpClient client used for the request
     * @param url URL to request
     * @return the response body, or an empty string if the response has none
     * @throws Exception if the request fails or the body cannot be read
     */
    private static String fetchPage(HttpClient httpClient, String url) throws Exception {
        GetMethod method = new GetMethod(url);
        try {
            ProxyHelper.executeMethod(httpClient, method);
            String body = method.getResponseBodyAsString();
            return (body == null) ? "" : body;
        } finally {
            method.releaseConnection();
        }
    }

    /**
     * Wraps a string into a new single-element value list.
     *
     * @param value the string to wrap
     * @return a new mutable list holding one {@link LocalizedString} for {@code value}
     */
    private static List<LocalizedTripleObject> singleValue(String value) {
        List<LocalizedTripleObject> list = new ArrayList<LocalizedTripleObject>();
        list.add(new LocalizedString(value));
        return list;
    }

    /**
     * Replaces the entities {@code &lt;}, {@code &gt;} and {@code &amp;} by their characters.
     *
     * @param result the encoded text
     * @return the decoded text
     */
    private static String decode(String result) {
        result = result.replace("&lt;", "<");
        result = result.replace("&gt;", ">");
        // "&amp;" goes last so that an encoded "&amp;lt;" ends up as "&lt;" and not as "<".
        result = result.replace("&amp;", "&");
        return result;
    }

}