org.epop.dataprovider.acmdigital.ACMDigitalLibrarySearch.java Source code

Introduction

Here is the source code for org.epop.dataprovider.acmdigital.ACMDigitalLibrarySearch.java
Source

/*
 * Copyright (c) 2012 fm&selab.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     fm&selab - initial API and implementation
 *     TESI DI Claudio Capelli c.capelli2@studenti.unibg.it
 *     Florian Pollak - several modifications
 */
package org.epop.dataprovider.acmdigital;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;

import org.apache.http.client.ClientProtocolException;
import org.apache.log4j.Logger;
import org.epop.dataprovider.DataProvider;
import org.epop.dataprovider.HTMLPage;
import org.epop.dataprovider.PatternMismatchException;
import org.epop.dataprovider.Utils;
import org.epop.utils.StringUtils;
import org.w3c.dom.Node;

import com.github.bfour.fpliteraturecollector.domain.Author;
import com.github.bfour.fpliteraturecollector.domain.Link;
import com.github.bfour.fpliteraturecollector.domain.Literature;
import com.github.bfour.fpliteraturecollector.domain.builders.AuthorBuilder;
import com.github.bfour.fpliteraturecollector.domain.builders.LiteratureBuilder;

public class ACMDigitalLibrarySearch extends DataProvider {

    private static final String DOMAIN = "http://dl.acm.org/";
    private static final String BASE_URL = "http://dl.acm.org/results.cfm";
    private static final long DELAY = 28611;
    private static final String AUTHOR_ID_PATTERN_STRING = ".*author_page\\.cfm\\?id=(\\d+).*";
    private static Pattern AUTHOR_ID_PATTERN;
    private static final String CITATION_COUNT_PATTERN_STRING = ".*Citation Count: (\\d+).*";
    private static Pattern CITATION_COUNT_PATTERN;
    private static final String ID_PATTERN_STRING = ".*[?&]id=((\\d+)\\.?(\\d+)?).*";
    private static Pattern ID_PATTERN;

    static private Logger logger = Logger.getLogger(ACMDigitalLibrarySearch.class);

    @Override
    public String getDescription() {
        return "ACM Digital Library";
    }

    @Override
    /**getHTMLDoc
     * connect to the server and return the response at the query, call the method "getParameters"
     * in order to have parameters to create the uri
     * @return responseBody A string that contains the response at the query
     */
    protected Reader getHTMLDoc(String htmlParams, int pageTurnLimit, boolean initialWait) {

        boolean exit = false; // flag to exit if the author was not found

        URI uri;
        String responseBody = "";

        try {

            if (initialWait)
                Thread.sleep(DELAY);

            uri = new URI(BASE_URL + "?" + htmlParams);
            HTMLPage page = new HTMLPage(uri);
            responseBody = page.getRawCode();

            if (pageTurnLimit == 0)
                return new StringReader(responseBody);

            int counter = 1; // there are 20 records for page
            String newResponseBody = responseBody;

            // Verifies if the author was not found
            if (responseBody.contains("was not found.")) {
                exit = true;
            }

            // take all the pages if the author was found
            while (newResponseBody.contains("next</a>") && !exit) {

                Thread.sleep(DELAY);

                URI newUri = new URI(BASE_URL + "?" + htmlParams + "&start=" + String.valueOf((counter * 20) + 1));
                newResponseBody = new HTMLPage(newUri).getRawCode();
                responseBody = responseBody + newResponseBody;

                if (pageTurnLimit == counter)
                    return new StringReader(responseBody);

                counter++;

            }

        } catch (URISyntaxException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (ClientProtocolException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        // logger.debug("responseBody =\n" + responseBody);
        // return the response
        return new StringReader(responseBody);

    }

    @Override
    /**parsePage
     * Parse a page and call the method "extractPaper" in order to build a list of papers
     * @param page Reader object that contains the html page
     * @return result List<Paper> that contains a list of papers
     */
    protected List<Literature> parsePage(Reader page) {

        List<Literature> result = new ArrayList<Literature>();

        try {
            Source source = new Source(page); // defines the source to parse
            /**
             * get elements with TR, style=padding-bottom: 15px and elements
             * style=padding-bottom: 5px; for notFound
             */
            Iterator<Element> elementList = source.getAllElements(HTMLElementName.TD).iterator();
            // iterates elements with tag "td"
            while (elementList.hasNext()) {
                Element element = elementList.next();
                StartTag startTag = element.getStartTag();
                Attribute Attr = startTag.getAttributes().get("style");

                // call the method notFound to verify if the query is found
                if (Attr != null && Attr.getValue().equals("padding-bottom: 5px;")) {
                    if (notFound(element)) {
                        return result;
                    }
                }
                // style="padding-bottom: 15px" is common for every record in
                // the page
                if (Attr != null && Attr.getValue().equals("padding-bottom: 15px")) {
                    // Logger.debug(element.toString());
                    Literature extractedPaper = extractPaper(element);
                    // logger.debug(extractedPaper);

                    if (extractedPaper != null)
                        result.add(extractedPaper);
                }
            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return result;
    }

    /**
     * extractPaper Extracts the parameters
     * 
     * @param element
     *            Element that contains the HTML element to analyze
     * @return Paper Object that contains the extracted parameters
     */
    private Literature extractPaper(Element element) {

        try {
            if (AUTHOR_ID_PATTERN == null)
                AUTHOR_ID_PATTERN = Pattern.compile(AUTHOR_ID_PATTERN_STRING);
            if (CITATION_COUNT_PATTERN == null)
                CITATION_COUNT_PATTERN = Pattern.compile(CITATION_COUNT_PATTERN_STRING);
        } catch (PatternSyntaxException e) {
            // TODO implement error handling
            e.printStackTrace();
            return null;
        }

        LiteratureBuilder builder = new LiteratureBuilder();

        /**
         * extract paperTitle analyze all the elements with tag "a", attribute
         * class="medium-text" and extract the title
         */
        URI pageURI = null;
        for (Element a : element.getAllElements(HTMLElementName.A)) {
            Attribute classAttr = a.getStartTag().getAttributes().get("class");
            if (classAttr != null) {
                if (classAttr.getValue().equals("medium-text")) {
                    Source htmlSource = new Source(a.getContent().toString());
                    String title = StringUtils.formatInLineSingleSpace(htmlSource.getTextExtractor().toString());
                    logger.debug(title);
                    builder.setTitle(title);
                    String href = a.getAttributeValue("href");
                    if (builder.getWebsiteURLs() == null)
                        builder.setWebsiteURLs(new HashSet<Link>());
                    try {
                        pageURI = new URI(DOMAIN + href + "&preflayout=flat");
                        builder.getWebsiteURLs().add(new Link("ACM", pageURI));
                    } catch (URISyntaxException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                    try {
                        if (ID_PATTERN == null)
                            ID_PATTERN = Pattern.compile(ID_PATTERN_STRING);
                        Matcher idMatcher = ID_PATTERN.matcher(href);
                        if (idMatcher.find()) {
                            builder.setAcmID(idMatcher.group(1));
                        }
                        // else
                        // TODO error handling
                    } catch (PatternSyntaxException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            }
        }

        /**
         * extract paperAuthors analyze all the elements with tag "div",
         * attribute class="authors" and extract the authors
         */
        for (Element s : element.getAllElements(HTMLElementName.DIV)) {
            Attribute classAttr = s.getStartTag().getAttributes().get("class");
            if (classAttr != null) {
                if (classAttr.getValue().equals("authors")) {
                    Set<Author> authors = new HashSet<>();
                    for (Element q : s.getAllElements(HTMLElementName.A)) {
                        Source htmlSource = new Source(q.getContent().toString());
                        AuthorBuilder authBuilder = new AuthorBuilder();
                        String paperAuthor = StringUtils
                                .formatInLineSingleSpace(htmlSource.getTextExtractor().toString());
                        try {
                            Utils.setFirstMiddleLastNameFromNameString(authBuilder, paperAuthor);
                            Matcher matcher = AUTHOR_ID_PATTERN.matcher(q.getAttributeValue("href"));
                            if (matcher.find()) {
                                authBuilder.setAcmID(matcher.group(1));
                            } else {
                                // TODO error handling
                            }
                            authors.add(authBuilder.getObject());
                        } catch (PatternMismatchException e) {
                            // TODO Auto-generated catch block
                            e.printStackTrace();
                        }
                    }
                    builder.setAuthors(authors);
                    /**
                     * extract paperPlace analyze all the elements with tag
                     * "div", attribute class="addinfo" and extract the place
                     */
                } else if (classAttr.getValue().equals("addinfo")) {
                    for (Element g : s.getAllElements(HTMLElementName.DIV)) {
                        Source htmlSource = new Source(g.getContent().toString());
                        String paperPlace = StringUtils
                                .formatInLineSingleSpace(htmlSource.getTextExtractor().toString());
                        logger.debug(paperPlace);
                        builder.setPublicationContext(paperPlace);
                    }
                }
            }
        }

        /**
         * extract citedInfo analyze all the elements with tag "td" that
         * contains the string "Bibliometrics", replace keywords that identify
         * the parameters with a separator, used to split the string.The
         * unnecessary text is deleted calling the "replace" method of the class
         * String.Finally is extracted the info using StringTokenizer
         */
        for (Element y : element.getAllElements(HTMLElementName.TD)) {
            if (y.toString().contains("Bibliometrics")) {
                Matcher matcher = CITATION_COUNT_PATTERN.matcher(y.getContent());
                if (matcher.find()) {
                    builder.setAcmNumCitations(Integer.parseInt(matcher.group(1)));
                } else {
                    // TODO error handling
                }
            }

            /**
             * extract venueYear analyze all the elements with tag "td" that
             * contains a month and call the method ExtractYear to extract the
             * year
             */
            if (y.toString().contains("January") || y.toString().contains("February")
                    || y.toString().contains("March") || y.toString().contains("April")
                    || y.toString().contains("May") || y.toString().contains("June")
                    || y.toString().contains("July") || y.toString().contains("August")
                    || y.toString().contains("September") || y.toString().contains("October")
                    || y.toString().contains("November") || y.toString().contains("December")) {
                String s = y.getContent().toString();
                String venueYear = ExtractYear(s);
                logger.debug(venueYear);
                if (!venueYear.isEmpty()) {
                    builder.setYear(Integer.parseInt(venueYear));
                }

            }
        }

        try {
            try {
                Thread.sleep(2861);
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
            HTMLPage entryPage = new HTMLPage(pageURI);
            try {
                Node abstractTextNode = entryPage.getNodeByXPath("//*[@id='fback']/div[3]/div[1]");
                if (abstractTextNode != null)
                    builder.setAbstractText(abstractTextNode.getTextContent());
            } catch (XPathExpressionException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            try {
                Node fullTextLinkNode = entryPage.getNodeByXPath(
                        "//*[@id='divmain']/table[1]/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/a[@name='FullTextPDF']/@href");
                if (fullTextLinkNode != null) {
                    String href = fullTextLinkNode.getTextContent(); // ft_gateway.cfm?id=1150304&ftid=371641&dwn=1&CFID=553795364&CFTOKEN=65402853
                    if (builder.getFulltextURLs() == null)
                        builder.setFulltextURLs(new HashSet<Link>());
                    URI fulltextURI = new URI(DOMAIN + href);
                    builder.getFulltextURLs().add(new Link("ACM", fulltextURI));
                }
            } catch (XPathExpressionException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (URISyntaxException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        } catch (IOException | ParserConfigurationException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        // return a new Paper if a title and a paper was found
        if (builder.getTitle() != null && builder.getAuthors() != null) {
            return builder.getObject();
        } else {
            return null;
        }

    }

    /**
     * ExtractYear Extracts the year contained in a date
     * 
     * @param date
     *            String that contains the year
     * @return year String containing only the year
     */
    public String ExtractYear(String date) {

        int c = 0;// number of character
        String year = "";// string with the year
        String t = "";// temporary string

        for (int i = 0; i < date.length(); i++) {
            if (isInt(String.valueOf(date.charAt(i)))) {

                c++;
                t = t + String.valueOf(date.charAt(i));

                if (t.length() == 4)
                    year = t;

                if (c > 4) {
                    c = 0;
                    t = "";
                    year = "";
                }

            } else {
                c = 0;
                t = "";
                if (year.length() == 4)
                    return year;
            }
        }

        return year;
    }

    /**
     * isInt Verifies if a string is an integer
     * 
     * @param s
     *            String to verify
     * @return true if the input is an integer
     * @return false if the input is not an integer
     */
    public boolean isInt(String s) {
        // call the method parseInt to verify that the string is an integer
        try {
            Integer.parseInt(s);
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /**
     * notFound Verifies if the query was not found
     * 
     * @param element
     *            Element that contains the HTML element to analyze
     * @return true if the query was not found
     * @return false if the query was found
     */
    public boolean notFound(Element element) {
        // Take all the elements with tag "font"
        for (Element c : element.getAllElements(HTMLElementName.FONT)) {
            // attribute size=+1
            Attribute Attr3 = c.getStartTag().getAttributes().get("size");
            if (Attr3 != null) {
                if (Attr3.getValue().equals("+1")) {
                    // verify if there's the string "was not found"
                    if (c.toString().contains("was not found")) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

}