org.apache.marmotta.ucuenca.wk.provider.gs.GoogleScholarProvider.java Source code

Introduction

Here is the source code for org.apache.marmotta.ucuenca.wk.provider.gs.GoogleScholarProvider.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.marmotta.ucuenca.wk.provider.gs;

//import org.apache.commons.lang3.StringUtils;
//import org.apache.marmotta.commons.vocabulary.FOAF;
//import com.google.common.base.Preconditions;
import com.google.gson.Gson;
import com.google.gson.JsonArray;

import org.apache.marmotta.ldclient.api.endpoint.Endpoint;
import org.apache.marmotta.ldclient.exception.DataRetrievalException;
import org.apache.marmotta.ldclient.model.ClientConfiguration;
import org.apache.marmotta.ldclient.model.ClientResponse;
import org.apache.marmotta.ldclient.services.ldclient.LDClient;
import org.apache.marmotta.ldclient.services.provider.AbstractHttpProvider;
import org.apache.marmotta.ucuenca.wk.provider.gs.util.GSXMLHandler;
import org.apache.marmotta.ucuenca.wk.provider.gs.util.GSresult;
import org.apache.marmotta.ucuenca.wk.provider.gs.util.JSONtoRDF;
//import org.jdom2.Document;
//import org.jdom2.Element;
//import org.jdom2.JDOMException;
//import org.jdom2.filter.ElementFilter;
//import org.jdom2.input.SAXBuilder;
//import org.jdom2.input.sax.XMLReaders;
//import org.jdom2.xpath.XPathFactory;
import org.openrdf.model.Model;
//import org.openrdf.model.ValueFactory;
//import org.openrdf.model.impl.ValueFactoryImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.IOException;
import java.io.InputStream;
//import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
//import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Support Google Scholar information as RDF
 * <p/>
 * Author: Santiago Gonzalez
 */
public class GoogleScholarProvider extends AbstractHttpProvider {

    public static final String NAME = "Google Scholar Provider";
    public static final String API = "http://scholar.google.com/scholar?start=%s&q=author:%%22%s%%22%s&hl=en&as_sdt=1%%2C15&as_vis=1%s";
    public static final String PATTERN = "http(s?)://scholar\\.google\\.com/scholar\\?start\\=0\\&q=author\\:%22(.*)%22\\&hl=en\\&as_sdt\\=1%2C15\\&as_vis\\=1(.*)$";
    private static String nsUcuenca = "https://www.cedia.org.ec/";

    private static Logger log = LoggerFactory.getLogger(GoogleScholarProvider.class);

    private String stringSearch = null, authorSearch = null, advancedSearch = null;

    public static final ConcurrentMap<String, String> MAPPINGSCHEMA = new ConcurrentHashMap<String, String>();

    static {
        MAPPINGSCHEMA.put("entity::type", "http://purl.org/ontology/bibo/Document");
        MAPPINGSCHEMA.put("entity::property:link", "http://purl.org/ontology/bibo/uri");
        MAPPINGSCHEMA.put("entity::property:title", "http://purl.org/dc/terms/title");
        MAPPINGSCHEMA.put("entity::property:text", "http://purl.org/ontology/bibo/abstract");
        MAPPINGSCHEMA.put("entity::property:journal", "http://purl.org/ontology/bibo/Journal");
        MAPPINGSCHEMA.put("entity::property:date", "http://purl.org/dc/elements/1.1/date");
        MAPPINGSCHEMA.put("entity::property:doi", "http://purl.org/ontology/bibo/doi");
        MAPPINGSCHEMA.put("entity::property:authorlist", "http://purl.org/ontology/bibo/authorList");
        MAPPINGSCHEMA.put("entity::property:quote", "http://purl.org/ontology/bibo/Quote");
        MAPPINGSCHEMA.put("entity::property:conference", "http://purl.org/ontology/bibo/Conference");
        MAPPINGSCHEMA.put("entity::property:cites", "http://purl.org/ontology/bibo/cites");
        MAPPINGSCHEMA.put("entity::property:type", nsUcuenca + "type");
        MAPPINGSCHEMA.put("entity::property:referenceCount", nsUcuenca + "referenceCount");
        MAPPINGSCHEMA.put("entity::property:citationCount", nsUcuenca + "citationCount");
        MAPPINGSCHEMA.put("entity::property:contributor", "http://purl.org/dc/terms/contributor");
        MAPPINGSCHEMA.put("entity::property:pdf", nsUcuenca + "pdf");
        MAPPINGSCHEMA.put("entity::property:cites", "http://purl.org/ontology/bibo/cites");
        MAPPINGSCHEMA.put("entity::property:fulltextlink", "http://purl.org/ontology/bibo/content");
        MAPPINGSCHEMA.put("entity::property:creator", "http://purl.org/dc/elements/1.1/creator");

    }

    /**
     * Return the name of this data provider. To be used e.g. in the
     * configuration and in log messages.
     *
     * @return
     */
    @Override
    public String getName() {
        return NAME;
    }

    /**
     * Return the list of mime types accepted by this data provider.
     *
     * @return
     */
    @Override
    public String[] listMimeTypes() {
        return new String[] { "text/html" };
    }

    /**
     * Build the URL to use to call the webservice in order to retrieve the data
     * for the resource passed as argument. In many cases, this will just return
     * the URI of the resource (e.g. Linked Data), but there might be data
     * providers that use different means for accessing the data for a resource,
     * e.g. SPARQL or a Cache.
     *
     *
     * @param resource
     * @param endpoint endpoint configuration for the data provider (optional)
     * @return
     */
    @Override
    public List<String> buildRequestUrl(String resource, Endpoint endpoint) {
        String url = null;
        Matcher m = Pattern.compile(PATTERN).matcher(resource);
        if (m.find()) {
            stringSearch = m.group(2);
            authorSearch = m.group(3);
            advancedSearch = m.group(3);
            log.debug("Extracting info for: {0}", stringSearch);
            if (authorSearch.length() > 0) {
                log.debug("Extra author search parameters: {0}", authorSearch);
            }
            if (advancedSearch.length() > 0) {
                log.debug("Advanced search parameters: {0}", advancedSearch);
            }
            url = resource;
        }
        return Collections.singletonList(url);
    }

    @Override
    public List<String> parseResponse(String resource, String requestUrl, Model triples, InputStream input,
            String contentType) throws DataRetrievalException {
        log.debug("Request Successful to {0}", requestUrl);
        final GSXMLHandler gsXMLHandler = new GSXMLHandler();
        gsXMLHandler.clearGSresultList();
        try {
            XMLReader xr = XMLReaderFactory.createXMLReader("org.ccil.cowan.tagsoup.Parser");
            xr.setContentHandler(gsXMLHandler);
            InputSource gsxml = new InputSource(input);
            gsxml.setEncoding("iso-8859-1");
            xr.parse(gsxml);

            final Set<GSresult> gsresultlist = gsXMLHandler.getGSresultList();
            Gson gson = new Gson();
            JsonArray json = new JsonArray();
            for (GSresult d : gsresultlist) {
                json.add(gson.toJsonTree(d).getAsJsonObject());
            }
            JSONtoRDF parser = new JSONtoRDF(resource, MAPPINGSCHEMA, json, triples);
            try {
                parser.parse();
            } catch (Exception e) {
                throw new DataRetrievalException("I/O exception while retrieving resource: " + requestUrl, e);
            }
            int numPages = (int) ((double) (gsXMLHandler.getNumResults() / 10)) + 1;
            int pagesLoaded = 1;
            Model model = null;
            while (pagesLoaded < numPages) {

                String pagenumquery = Integer.toString(pagesLoaded * 10);
                String moreDataUrl = String.format(API, pagenumquery, stringSearch, authorSearch, advancedSearch);
                ClientConfiguration conf = new ClientConfiguration();
                LDClient ldClient = new LDClient(conf);
                ClientResponse response = ldClient.retrieveResource(moreDataUrl);
                Model pageModel = response.getData();
                if (model == null) {
                    model = pageModel;
                } else {
                    model.addAll(pageModel);
                }
                pagesLoaded++;
            }
            triples.addAll(model);

        } catch (SAXException | IOException e) {
            throw new DataRetrievalException("I/O exception while retrieving resource: " + requestUrl, e);
        }

        //       try {
        //          List<String> candidates = new ArrayList<String>();
        //          ValueFactory factory = ValueFactoryImpl.getInstance();
        //          final Document doc = new SAXBuilder(XMLReaders.NONVALIDATING).build(input);
        //          for(Element element: queryElements(doc, "/result/hits/hit/info/url")) {
        //             String candidate = element.getText();
        //             triples.add(factory.createStatement(factory.createURI( resource ), FOAF.member, factory.createURI( candidate ) ));
        //             candidates.add(candidate);
        //          }
        //          ClientConfiguration conf = new ClientConfiguration();
        //           LDClient ldClient = new LDClient(conf);
        //           if(!candidates.isEmpty()) {
        //              Model candidateModel = null;
        //             for(String author: candidates) {
        //                ClientResponse response = ldClient.retrieveResource(author);
        //                 Model authorModel = response.getData();
        //                 if(candidateModel == null) {
        //                    candidateModel = authorModel;
        //                 } else {
        //                    candidateModel.addAll(authorModel);
        //                 }
        //             }
        //             triples.addAll(candidateModel);
        //           }
        //       }catch (IOException e) {
        //            throw new DataRetrievalException("I/O error while parsing HTML response", e);
        //        }catch (JDOMException e) {
        //            throw new DataRetrievalException("could not parse XML response. It is not in proper XML format", e);
        //        }
        return Collections.emptyList();
    }

    //    protected static List<Element> queryElements(Document n, String query) {
    //        return XPathFactory.instance().compile(query, new ElementFilter()).evaluate(n);
    //    }
}