org.epop.dataprovider.msacademic.MicrosoftAcademicSearch.java Source code

Java tutorial

Introduction

Here is the source code for org.epop.dataprovider.msacademic.MicrosoftAcademicSearch.java

Source

/*******************************************************************************
 * Copyright (c) 2012 fm&selab.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     fm&selab - initial API and implementation
 *     Florian Pollak - several modifications
 ******************************************************************************/
package org.epop.dataprovider.msacademic;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.epop.dataprovider.DataProvider;
import org.epop.dataprovider.HTMLPage;
import org.epop.dataprovider.PatternMismatchException;
import org.epop.dataprovider.Utils;
import org.epop.utils.StringUtils;
import org.w3c.dom.Node;

import com.github.bfour.fpliteraturecollector.domain.Author;
import com.github.bfour.fpliteraturecollector.domain.Link;
import com.github.bfour.fpliteraturecollector.domain.Literature;
import com.github.bfour.fpliteraturecollector.domain.Literature.LiteratureType;
import com.github.bfour.fpliteraturecollector.domain.builders.AuthorBuilder;
import com.github.bfour.fpliteraturecollector.domain.builders.LiteratureBuilder;

public class MicrosoftAcademicSearch extends DataProvider {

    private static final String MS_ACADEMIC_SEARCH = "academic.research.microsoft.com/Search";
    Logger logger = Logger.getLogger(MicrosoftAcademicSearch.class.getCanonicalName());
    private static final long DELAY = 18611;
    private static final int searchStep = 10;

    private static final Pattern AUTHOR_ID_PATTERN = Pattern
            .compile(".*academic\\.research\\.microsoft\\.com/Author/(\\d+)/.*");
    private static final Pattern ID_PATTERN = Pattern.compile(".*Publication/(\\d+)/.*");

    @Override
    public String getDescription() {
        return "Microsoft Academic";
    }

    @Override
    protected Reader getHTMLDoc(String htmlParams, int pageTurnLimit, boolean initialWait) {

        // http://academic.research.microsoft.com/Search/?query=author:(%22Angelo%20gargantini%22)&start=1&end=10000
        // List<NameValuePair> qparams = new ArrayList<NameValuePair>();
        // qparams.add(new BasicNameValuePair("query","author:(\""+
        // q.getCompleteAuthorName()+"\")"));//with quotation marks
        // qparams.add(new BasicNameValuePair("query", "author:("
        // + q.getCompleteAuthorName() + ")"));// without quotation marks
        // qparams.add(new BasicNameValuePair("start", "1"));
        // qparams.add(new BasicNameValuePair("end",
        // String.valueOf(searchStep)));

        URI uri;
        String responseBody = "";

        try {

            if (initialWait)
                Thread.sleep(DELAY);

            uri = URIUtils.createURI("http", MS_ACADEMIC_SEARCH, -1, "", htmlParams, null);
            HttpGet httpget = new HttpGet(uri);
            System.out.println(httpget.getURI());
            HttpClient httpclient = new DefaultHttpClient();
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            responseBody = httpclient.execute(httpget, responseHandler);

            if (pageTurnLimit == 0)
                return new StringReader(responseBody);

            int counter = 1;
            String newResponseBody = responseBody;
            while (newResponseBody.contains(
                    "<a id=\"ctl00_MainContent_PaperList_Next\" title=\"Go to Next Page\" class=\"nextprev\"")) {

                Thread.sleep(DELAY);

                URI newUri = URIUtils.createURI("http", MS_ACADEMIC_SEARCH, -1, "",
                        htmlParams + "&start=" + String.valueOf((counter * searchStep) + 1) + "&end="
                                + String.valueOf((counter + 1) * searchStep),
                        null);

                httpget = new HttpGet(newUri);
                System.out.println(httpget.getURI());
                httpclient = new DefaultHttpClient();
                newResponseBody = httpclient.execute(httpget, responseHandler);
                // System.out.println(newResponseBody);
                responseBody = responseBody + newResponseBody;

                if (pageTurnLimit == counter)
                    return new StringReader(responseBody);

                counter++;

            }

        } catch (URISyntaxException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (ClientProtocolException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        // return the result as string
        return new StringReader(responseBody);
    }

    @Override
    protected List<Literature> parsePage(Reader page) {
        List<Literature> result = new ArrayList<Literature>();
        try {
            Source source = new Source(page);
            Iterator<Element> elementList = source.getAllElements(HTMLElementName.LI).iterator();
            while (elementList.hasNext()) {
                Element element = elementList.next();
                // System.out.println(element.toString());
                StartTag startTag = element.getStartTag();
                Attribute classAttr = startTag.getAttributes().get("class");
                if (classAttr != null && classAttr.getValue().equals("paper-item")) {
                    // System.out.println(element.toString());
                    Literature extractedPaper = extractPaper(element);
                    // System.out.println(extractedPaper);
                    if (extractedPaper != null)
                        result.add(extractedPaper);
                }
            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return result;
    }

    private Literature extractPaper(Element element) {

        LiteratureBuilder litBuilder = new LiteratureBuilder();

        URI pageURI = null;

        for (Element s : element.getAllElements(HTMLElementName.DIV)) {
            Attribute classAttr = s.getStartTag().getAttributes().get("class");
            if (classAttr != null) {
                if (classAttr.getValue().equals("title-fullwidth") || classAttr.getValue().equals("title")) {

                    String id = s.getStartTag().getAttributes().get("id").getValue().replace("divTitle", "");

                    for (Element a : s.getAllElements(HTMLElementName.A)) {
                        // System.out.println(a.toString());
                        Attribute classAttr2 = a.getStartTag().getAttributes().get("id");
                        if (classAttr2 != null) {
                            if (classAttr2.getValue().equals(id + "Title")) {
                                Source htmlSource = new Source(a.getContent().toString());
                                String paperTitle = StringUtils
                                        .formatInLineSingleSpace(htmlSource.getTextExtractor().toString());
                                litBuilder.setTitle(paperTitle);
                                String pageURLString = "http://academic.research.microsoft.com"
                                        + a.getAttributeValue("href").substring(2);
                                if (litBuilder.getWebsiteURLs() == null)
                                    litBuilder.setWebsiteURLs(new HashSet<Link>());
                                try {
                                    pageURI = new URI(pageURLString);
                                    litBuilder.getWebsiteURLs().add(new Link("MS Academic", pageURI));
                                } catch (URISyntaxException e) {
                                    // TODO Auto-generated catch block
                                    e.printStackTrace();
                                }
                                Matcher idMatcher = ID_PATTERN.matcher(pageURLString);
                                if (idMatcher.find())
                                    litBuilder.setMsAcademicID(idMatcher.group(1));
                                // else
                                // TODO error handling
                            } else if (classAttr2.getValue().equals(id + "Citation")) {
                                Source htmlSource = new Source(a.getContent().toString());
                                String citedInfo = StringUtils.formatInLineSingleSpace(
                                        htmlSource.getTextExtractor().toString().replaceAll("Citations: ", ""));
                                try {
                                    Integer numCitations = Integer.parseInt(citedInfo);
                                    litBuilder.setMsAcademicNumCitations(numCitations);
                                } catch (NumberFormatException e) {
                                    // TODO error handling
                                    e.printStackTrace();
                                }
                            }
                        }
                    }
                } else if (classAttr.getValue().equals("content")) {

                    // authors

                    String[] parts = s.getContent().toString().split("<span class=\"span-break\"\\s?>, </span>");
                    for (String part : parts) {
                        AuthorBuilder authBuilder = new AuthorBuilder();
                        // ID
                        Source htmlSource = new Source(part);
                        if (htmlSource.getFirstElement("a") != null) {
                            String authorURL = htmlSource.getFirstElement("a").getAttributeValue("href");
                            java.util.regex.Matcher m = AUTHOR_ID_PATTERN.matcher(authorURL);
                            if (m.find()) {
                                authBuilder.setMsAcademicID(m.group(1));
                            } else {
                                // TODO feedback
                            }
                        }
                        // name
                        String nameString = htmlSource.getTextExtractor().toString();
                        try {
                            Utils.setFirstMiddleLastNameFromNameString(authBuilder, nameString);
                            if (litBuilder.getAuthors() == null)
                                litBuilder.setAuthors(new HashSet<Author>());
                            litBuilder.getAuthors().add(authBuilder.getObject());
                        } catch (PatternMismatchException e) {
                            // TODO Auto-generated catch block
                            e.printStackTrace();
                        }
                    }

                } else if (classAttr.getValue().equals("conference")) {

                    Source htmlSource = new Source(s.getContent().toString());

                    if (htmlSource.getFirstElementByClass("conference-name") != null)
                        litBuilder.setPublicationContext(
                                htmlSource.getFirstElementByClass("conference-name").getTextExtractor().toString());

                    if (htmlSource.getFirstElement("span") != null) {
                        String contextType = htmlSource.getFirstElement("span").getTextExtractor().toString();
                        if (contextType.equals("Journal:"))
                            litBuilder.setType(LiteratureType.JOURNAL_PAPER);
                        if (contextType.equals("Conference:"))
                            litBuilder.setType(LiteratureType.CONFERENCE_PAPER);
                    }

                    for (Element q : s.getAllElements(HTMLElementName.SPAN)) {
                        // System.out.println(q.toString());
                        Attribute classAttr3 = q.getStartTag().getAttributes().get("class");
                        if (classAttr3 != null && classAttr3.getValue().equals("year")) {
                            htmlSource = new Source(q.getContent().toString());
                            String venueYearStr = StringUtils
                                    .formatInLineSingleSpace(htmlSource.getTextExtractor().toString());
                            venueYearStr = venueYearStr.substring(venueYearStr.lastIndexOf(",") + 1).trim();
                            Integer venueYear;
                            try {
                                venueYear = Integer.parseInt(venueYearStr);
                            } catch (NumberFormatException ae) {
                                venueYear = null;
                            }
                            litBuilder.setYear(venueYear);
                            break;
                        }
                        // class="year"
                    }
                }
            }
        }

        if (pageURI != null) {
            HTMLPage page;
            try {
                try {
                    Thread.sleep(1861);
                } catch (InterruptedException e1) {
                    e1.printStackTrace();
                }
                page = new HTMLPage(pageURI);
                try {
                    Node abstractTextNode = page.getNodeByXPath(".//div[@class='abstract']/span");
                    if (abstractTextNode != null)
                        litBuilder.setAbstractText(abstractTextNode.getTextContent());
                } catch (XPathExpressionException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                try {
                    String doiString = page
                            .getStringByXPath(".//*[@id='ctl00_MainContent_PaperItem_hypDOIText']/text()");
                    if (!doiString.isEmpty())
                        litBuilder.setDOI(doiString);
                } catch (XPathExpressionException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            } catch (IOException | ParserConfigurationException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }

        return litBuilder.getObject();

    }
}