org.bibsonomy.scraper.url.kde.ieee.IEEEXploreJournalProceedingsScraper.java Source code

Introduction

Here is the source code for org.bibsonomy.scraper.url.kde.ieee.IEEEXploreJournalProceedingsScraper.java
Source

/**
 *
 *  BibSonomy-Scraper - Web page scrapers returning BibTeX for BibSonomy.
 *
 *  Copyright (C) 2006 - 2011 Knowledge & Data Engineering Group,
 *                            University of Kassel, Germany
 *                            http://www.kde.cs.uni-kassel.de/
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

package org.bibsonomy.scraper.url.kde.ieee;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bibsonomy.model.util.BibTexUtils;
import org.bibsonomy.scraper.AbstractUrlScraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.Tuple;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.util.WebUtils;
import org.bibsonomy.util.XmlUtils;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Scraper for journal and proceedings articles from IEEE Xplore.
 * <p>
 * If the URL carries an article id ({@code arnumber} or {@code chklist}
 * parameter), the BibTeX is fetched from the IEEE citation download URL.
 * If no id is present, or the download returns {@code null}, the
 * bibliographic data is extracted from the HTML of the page itself.
 *
 * @author rja
 */
public class IEEEXploreJournalProceedingsScraper extends AbstractUrlScraper {
    private static final String SITE_NAME = "IEEEXplore Journals";
    private static final String SITE_URL = "http://ieeexplore.ieee.org/";
    private static final String INFO = "This scraper creates a BibTeX entry for the journals and proceedings at "
            + href(SITE_URL, SITE_NAME) + ".";

    private static final Log log = LogFactory.getLog(IEEEXploreJournalProceedingsScraper.class);
    private static final String IEEE_HOST = "ieeexplore.ieee.org";
    private static final String IEEE_PATH = "xpl";
    private static final String IEEE_JOURNAL = "@article";
    private static final String IEEE_PROCEEDINGS = "@proceedings";
    private static final String IEEE_INPROCEEDINGS = "@inproceedings";

    /** Prefix of the citation download URL; the article id is appended. */
    private static final String DOWNLOAD_URL = "http://ieeexplore.ieee.org/xpl/downloadCitations?citations-format=citation-abstract&download-format=download-bibtex&fromPageName=abstract&recordIds=";

    /* labels that precede the values inside the details paragraph */
    private static final String CONST_DATE = "Publication Date: ";
    private static final String CONST_VOLUME = "Volume: ";
    private static final String CONST_PAGES = "On page(s): ";
    private static final String CONST_BOOKTITLE = "This paper appears in: ";
    private static final String CONST_ISSN = "ISSN: ";
    private static final String CONST_ISBN = "ISBN: ";
    private static final String CONST_DOI = "Digital Object Identifier: ";

    /* raw HTML markers enclosing the abstract */
    private static final String ABSTRACT_START = "<span class=\"sectionHeaders\">Abstract</span><br>";
    private static final String ABSTRACT_END = "<td class=\"bodyCopyGrey\"><p class=\"bodyCopyGreySpaced\"><strong>Index";

    /** Value of the class attribute of the span that holds the title. */
    private static final String TITLE_SPAN_CLASS = "headNavBlueXLarge2";
    /** Authors are separated by two non-breaking spaces. */
    private static final String AUTHOR_SEPARATOR = "\u00A0\u00A0";
    /** Placeholder used when no author name could be extracted. */
    private static final String UNKNOWN_AUTHOR = "N/A";

    private static final Pattern ARNUMBER_PATTERN = Pattern.compile("arnumber=([^&]*)");
    private static final Pattern CHKLIST_PATTERN = Pattern.compile("chklist=([^%]*)");
    private static final Pattern YEAR_PATTERN = Pattern.compile("\\d{4}");

    private static final List<Tuple<Pattern, Pattern>> URL_PATTERNS = Collections
            .singletonList(new Tuple<Pattern, Pattern>(Pattern.compile(".*" + IEEE_HOST),
                    Pattern.compile("/" + IEEE_PATH + ".*")));

    /**
     * Scrapes the URL of the given context unless it contains "punumber".
     * Tries the direct BibTeX download first and falls back to parsing the
     * page when no article id is found or the download returns {@code null}.
     *
     * @param sc the scraping context; receives this scraper and the BibTeX result
     * @return {@code false} if the URL contains "punumber", {@code true} otherwise
     * @throws ScrapingException if the download or the page extraction fails
     */
    protected boolean scrapeInternal(ScrapingContext sc) throws ScrapingException {
        final String url = sc.getUrl().toString();

        // FIXME: this check should move into the path pattern!
        if (url.indexOf("punumber") != -1) {
            return false;
        }
        sc.setScraper(this);

        final String id = extractArticleId(url);
        if (id != null) {
            final String bibtex = downloadBibtex(id);
            if (bibtex != null) {
                // clean up the download and append the url
                final String cleaned = bibtex.replace("<br>", "");
                sc.setBibtexResult(BibTexUtils.addFieldIfNotContained(cleaned, "url", url));
                return true;
            }
            log.debug(
                    "IEEEXploreJournalProceedingsScraper: direct bibtex download failed. Use JTidy to get bibliographic data.");
        } else {
            log.debug("IEEEXploreJournalProceedingsScraper use JTidy to get Bibtex from " + url);
        }

        sc.setBibtexResult(ieeeJournalProceedingsScrape(sc));
        return true;
    }

    public String getInfo() {
        return INFO;
    }

    /**
     * Builds a BibTeX entry from the HTML page of the given context.
     * <p>
     * The title is read from the span with class "headNavBlueXLarge2", the
     * authors from the span following it, and the remaining fields from the
     * paragraph that contains "This paper appears in: ". The entry type is
     * {@code @article} if an ISSN but no ISBN was found, {@code @proceedings}
     * if title and booktitle are equal, and {@code @inproceedings} otherwise.
     *
     * @param sc the scraping context providing the URL and the page content
     * @return the BibTeX entry as a string
     * @throws ScrapingException every failure (including a page without the
     *         title span or the details paragraph) is reported as an
     *         {@link InternalFailureException}
     */
    public String ieeeJournalProceedingsScrape(ScrapingContext sc) throws ScrapingException {
        try {
            final String url = sc.getUrl().toString();
            final String pageContent = sc.getPageContent();

            //-- get the html doc and parse the DOM
            final Document document = XmlUtils.getDOM(pageContent);

            final String abstr = extractAbstract(pageContent);

            //-- title and authors: locate the title span, the authors are in the next span
            String title = "";
            String[] authors = null;
            final NodeList spans = document.getElementsByTagName("span");
            for (int i = 0; i < spans.getLength(); i++) {
                final Element span = (Element) spans.item(i);
                // getAttribute yields "" for spans without a class attribute, so no null check is needed
                if (!TITLE_SPAN_CLASS.equals(span.getAttribute("class"))) {
                    continue;
                }
                final String titleText = nodeValueOf(span.getFirstChild());
                if (titleText != null) {
                    title = titleText;
                }
                final Node authorSpan = spans.item(i + 1);
                authors = parseAuthors(authorSpan == null ? null : nodeValueOf(authorSpan.getFirstChild()));
                break;
            }
            if (authors == null) {
                // wrapped into an InternalFailureException by the catch block below
                throw new IllegalStateException("no span with class '" + TITLE_SPAN_CLASS + "' found in " + url);
            }

            //-- global information like publication date, pages, ...
            final NodeList details = findDetailsParagraph(document);
            if (details == null) {
                throw new IllegalStateException("no paragraph containing '" + CONST_BOOKTITLE + "' found in " + url);
            }
            final String booktitle = extractBooktitle(details);

            String year = "";
            String volume = null;
            String pages = null;
            String issn = null;
            String isbn = null;
            String doi = null;
            for (int i = 0; i < details.getLength(); i++) {
                // element children have no node value, only text-like nodes carry data
                final String text = details.item(i).getNodeValue();
                if (text == null || "".equals(text)) {
                    continue;
                }
                final String date = getField(text, CONST_DATE);
                if (date != null) {
                    final String foundYear = extractYear(date);
                    if (foundYear != null) {
                        year = foundYear;
                    }
                }
                if (volume == null) {
                    final String rest = getField(text, CONST_VOLUME);
                    if (rest != null) {
                        // the volume ends at the next comma, if there is one
                        final int comma = rest.indexOf(',');
                        volume = (comma == -1 ? rest : rest.substring(0, comma)).trim();
                    }
                }
                if (pages == null) {
                    pages = getField(text, CONST_PAGES);
                }
                if (issn == null) {
                    issn = getField(text, CONST_ISSN);
                }
                if (isbn == null) {
                    isbn = getField(text, CONST_ISBN);
                }
                if (doi == null) {
                    doi = getField(text, CONST_DOI);
                }
            }

            //-- set bibtex type @article for journals & @proceedings for proceedings
            final String type;
            if (isBlank(isbn) && !isBlank(issn)) {
                type = IEEE_JOURNAL;
            } else if (title.equals(booktitle)) {
                type = IEEE_PROCEEDINGS;
            } else {
                type = IEEE_INPROCEEDINGS;
            }

            //-- join all authors for the bibtex field "author"
            final StringBuilder author = new StringBuilder();
            for (int i = 0; i < authors.length; i++) {
                if (i > 0) {
                    author.append(" and ");
                }
                author.append(authors[i]);
            }

            //-- the bibtex key consists of the first author's name (letters only) and the year
            final StringBuilder b = new StringBuilder(type);
            b.append("{").append(getName(authors[0])).append(":").append(year).append(",");
            appendBibtexField(b, "author", author.toString());
            appendBibtexField(b, "abstract", abstr);

            appendBibtexField(b, "title", title);
            appendBibtexField(b, "booktitle", booktitle);
            appendBibtexField(b, "url", url);
            appendBibtexField(b, "year", year);
            appendBibtexField(b, "isbn", isbn);
            appendBibtexField(b, "issn", issn);
            appendBibtexField(b, "doi", doi);
            appendBibtexField(b, "volume", volume);
            appendBibtexField(b, "pages", pages);
            b.append("}");

            return b.toString();

        } catch (Exception e) {
            throw new InternalFailureException(e);
        }
    }

    /**
     * Extracts the article id from the URL: the value of "arnumber" if
     * present, otherwise the value of "chklist".
     *
     * @return the id, or {@code null} if the URL contains neither parameter
     */
    private static String extractArticleId(String url) {
        Matcher matcher = ARNUMBER_PATTERN.matcher(url);
        if (matcher.find()) {
            return matcher.group(1);
        }
        matcher = CHKLIST_PATTERN.matcher(url);
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }

    /**
     * Downloads the BibTeX for the given article id.
     *
     * @return whatever the download returned (may be {@code null})
     * @throws ScrapingException an {@link InternalFailureException} wrapping the I/O error
     */
    private static String downloadBibtex(String id) throws ScrapingException {
        try {
            return WebUtils.getContentAsString(new URL(DOWNLOAD_URL + id));
        } catch (IOException ex) {
            // also covers MalformedURLException, which is an IOException
            throw new InternalFailureException(ex);
        }
    }

    /**
     * Cuts the abstract out of the raw page content, removing whitespace
     * runs and markup.
     *
     * @return the abstract, or "" if the enclosing markers are not found
     */
    private static String extractAbstract(String pageContent) {
        final int marker = pageContent.indexOf(ABSTRACT_START);
        if (marker == -1) {
            return "";
        }
        final int start = marker + ABSTRACT_START.length();
        // search the end marker behind the start marker only, otherwise substring could fail
        final int end = pageContent.indexOf(ABSTRACT_END, start);
        if (end == -1) {
            return "";
        }
        return pageContent.substring(start, end).replaceAll("\\s\\s+", "").replaceAll("(<.+?>)", "").trim();
    }

    /**
     * Splits the raw author text at the double non-breaking space separator
     * and drops empty entries.
     *
     * @param raw the text of the author span, may be {@code null}
     * @return at least one author name; {"N/A"} if none could be extracted
     */
    private static String[] parseAuthors(String raw) {
        final String[] parts = raw == null ? new String[0] : raw.split(AUTHOR_SEPARATOR);
        int count = 0;
        for (int i = 0; i < parts.length; i++) {
            // trim() does not remove non-breaking spaces, so convert them first
            final String name = parts[i].replace('\u00A0', ' ').trim();
            if (name.length() > 0) {
                parts[count++] = name;
            }
        }
        if (count == 0) {
            return new String[] { UNKNOWN_AUTHOR };
        }
        final String[] names = new String[count];
        System.arraycopy(parts, 0, names, 0, count);
        return names;
    }

    /**
     * Finds the first paragraph with a child node whose value contains
     * "This paper appears in: ".
     *
     * @return the child nodes of that paragraph, or {@code null} if there is none
     */
    private static NodeList findDetailsParagraph(Document document) {
        final NodeList paragraphs = document.getElementsByTagName("p");
        for (int i = 0; i < paragraphs.getLength(); i++) {
            final NodeList children = paragraphs.item(i).getChildNodes();
            for (int j = 0; j < children.getLength(); j++) {
                final String text = children.item(j).getNodeValue();
                if (text != null && text.indexOf(CONST_BOOKTITLE) != -1) {
                    return children;
                }
            }
        }
        return null;
    }

    /**
     * Reads the booktitle from the node two levels below the second child
     * of the details paragraph.
     *
     * @return the node value, or {@code null} if the expected nodes are missing
     */
    private static String extractBooktitle(NodeList details) {
        Node node = details.item(1);
        for (int depth = 0; depth < 2 && node != null; depth++) {
            node = node.getFirstChild();
        }
        return nodeValueOf(node);
    }

    /**
     * @return the last whitespace-separated token of the text that consists
     *         of exactly four digits, or {@code null} if there is none
     */
    private static String extractYear(String dateText) {
        String year = null;
        final StringTokenizer tokenizer = new StringTokenizer(dateText);
        while (tokenizer.hasMoreTokens()) {
            final String token = tokenizer.nextToken();
            if (YEAR_PATTERN.matcher(token).matches()) {
                year = token;
            }
        }
        return year;
    }

    /**
     * @return the part of the author before the first comma (or the whole
     *         string if there is no comma) with everything but ASCII letters
     *         removed; {@code null} if the author is {@code null}
     */
    private static String getName(String author) {
        if (author == null) {
            return null;
        }
        final int indexOfComma = author.indexOf(",");
        final String name = indexOfComma != -1 ? author.substring(0, indexOfComma) : author;
        return name.replaceAll("[^a-zA-Z]", "");
    }

    /**
     * @return the trimmed text following the given label, or {@code null}
     *         if the text does not contain the label
     */
    private static String getField(String text, String label) {
        final int index = text.indexOf(label);
        if (index == -1) {
            return null;
        }
        // start behind the label wherever it occurs, not at a fixed offset
        return text.substring(index + label.length()).trim();
    }

    /** Appends "field = {value}," unless the value is {@code null}. */
    private static void appendBibtexField(StringBuilder b, String field, String value) {
        if (value != null) {
            b.append(field).append(" = {").append(value).append("},");
        }
    }

    /** @return the value of the node, or {@code null} if the node is {@code null} */
    private static String nodeValueOf(Node node) {
        return node == null ? null : node.getNodeValue();
    }

    private static boolean isBlank(String value) {
        return value == null || value.trim().equals("");
    }

    public List<Tuple<Pattern, Pattern>> getUrlPatterns() {
        return URL_PATTERNS;
    }

    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    public String getSupportedSiteURL() {
        return SITE_URL;
    }

}