ie.nuim.cs.dri.metadata.WebSearch.java Source code

Introduction

Here is the source code for ie.nuim.cs.dri.metadata.WebSearch.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package ie.nuim.cs.dri.metadata;

import ie.nuim.cs.dri.repository.Configuration;
import ie.nuim.cs.dri.repository.ROS;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

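/**
 * Searches external bibliographic sources (Scopus, Web of Knowledge, Google
 * Scholar and CiteSeerX) for a research object by its title and extracts
 * publication metadata from the responses.
 *
 * @author Yalemisew
 */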
public class WebSearch {

    /**
     * Searches the Scopus database for publications matching a title.
     *
     * <p>The title is converted into a Scopus {@code TITLE("...")} query by
     * {@link #buildScopusSearchTitle(String)} and sent to the Elsevier search
     * API together with the API key read from {@link Configuration}. At most
     * 50 results are requested.
     *
     * @param title the title of the research object
     * @return the XML body returned by Scopus with its line breaks removed;
     * an empty (or partial) string when an I/O error interrupts the request
     * @throws RuntimeException if Scopus answers with an HTTP status other
     * than 200
     */
    public String searchScopus(String title) {
        Configuration config = new Configuration();
        String scopusAPI = config.getScopusAPIKey();

        // Use buildScopusExactSearchTitle instead for an exact (very restrictive) search.
        String query = buildScopusSearchTitle(title);
        HttpGet getRequest = new HttpGet(
                "http://api.elsevier.com/content/search/index:scopus?query=" + query + "&count=50");
        getRequest.addHeader("X-ELS-APIKey", scopusAPI);
        getRequest.addHeader("X-ELS-ResourceVersion", "XOCS");
        getRequest.addHeader("accept", "application/xml");

        StringBuilder xml = new StringBuilder();
        // try-with-resources releases the client on every exit path, including
        // the RuntimeException thrown below for a non-200 status.
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            HttpResponse response = httpClient.execute(getRequest);

            int status = response.getStatusLine().getStatusCode();
            if (status != 200) {
                throw new RuntimeException("Failed : HTTP error code : " + status);
            }

            // Decode explicitly as UTF-8 instead of relying on the platform default charset.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(response.getEntity().getContent(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    xml.append(line);
                }
            }
        } catch (IOException e) {
            // Covers ClientProtocolException as well; report it instead of silently dropping it.
            Logger.getLogger(WebSearch.class.getName()).log(Level.SEVERE, null, e);
        }
        return xml.toString();
    }

    /**
     * Queries the Web of Knowledge web service and returns the body of the
     * search response.
     *
     * <p>Two GET requests are sent to the {@code WOKMWSAuthenticate} endpoint:
     * the first one to authenticate, the second one carrying a {@code SID}
     * header derived from the first response. The status check and the
     * returned body refer to the second (search) response.
     *
     * <p>NOTE(review): this method looks unfinished - the query built from the
     * title is never sent, the search request targets the authentication URL,
     * and the {@code SID} header is the string form of the response object
     * rather than a session id. Confirm against the WOK service documentation
     * before relying on the result.
     *
     * @param title the title of the ROS
     * @return the body of the search response with its line breaks removed;
     * an empty (or partial) string when an I/O error interrupts the request
     * @throws RuntimeException if the search request is answered with an HTTP
     * status other than 200
     */
    public String searchWOS(String title) {
        // NOTE(review): built for the intended search but not yet attached to any request.
        String query = buildScopusSearchTitle(title);

        StringBuilder xml = new StringBuilder();
        // try-with-resources releases the client on every exit path.
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            HttpGet getAuthRequest = new HttpGet(
                    "http://search.webofknowledge.com/esti/wokmws/ws/WOKMWSAuthenticate");
            HttpResponse authResponse = httpClient.execute(getAuthRequest);

            HttpGet getSearchRequest = new HttpGet(
                    "http://search.webofknowledge.com/esti/wokmws/ws/WOKMWSAuthenticate");
            getSearchRequest.addHeader("SID", authResponse.toString());
            HttpResponse searchResponse = httpClient.execute(getSearchRequest);

            // Inspect the search response; previously the authentication
            // response was checked and read while the search response was ignored.
            int status = searchResponse.getStatusLine().getStatusCode();
            if (status != 200) {
                throw new RuntimeException("Failed : HTTP error code : " + status);
            }

            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(searchResponse.getEntity().getContent(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    xml.append(line);
                }
            }
        } catch (IOException e) {
            // Covers ClientProtocolException as well; report it instead of silently dropping it.
            Logger.getLogger(WebSearch.class.getName()).log(Level.SEVERE, null, e);
        }
        return xml.toString();
    }

    /**
     * Looks for the given title in a Google Scholar result page and dumps the
     * page's publication details to standard output.
     *
     * <p>The page is not fetched from the network: the HTML returned by
     * {@link #getGS()} is parsed instead. The text of every link inside an
     * {@code h3} heading is printed next to the title and compared with it,
     * ignoring case. If at least one link matches, every element with class
     * {@code gs_a} and then every element with class {@code gs_fl} is printed,
     * once per {@code div} it is found in.
     *
     * @param title the title of the ROS
     */
    public void searchGoogle(String title) {
        // The Scholar URL is assembled but never requested; the page comes from getGS().
        String scholarUrl = "http://scholar.google.com/scholar?" + buildGoogleSearchTitle(title);

        Document page = Jsoup.parse(getGS());
        Elements headings = page.getElementsByTag("h3");
        System.out.println("=====searching google=======");

        boolean titleMatched = false;
        for (Element heading : headings) {
            for (Element link : heading.getElementsByTag("a")) {
                String linkText = link.text();
                System.out.println(linkText + "\t" + title);
                if (title.equalsIgnoreCase(linkText)) {
                    titleMatched = true;
                    // Leaves only the current heading; the remaining headings are still listed.
                    break;
                }
            }
        }
        if (!titleMatched) {
            return;
        }

        Elements divs = page.getElementsByTag("div");
        printElementsWithClass(divs, "gs_a");
        printElementsWithClass(divs, "gs_fl");
    }

    /**
     * Prints, for each container in turn, every element found by
     * {@code getElementsByClass} for the given class name.
     */
    private static void printElementsWithClass(Elements containers, String cssClass) {
        for (Element container : containers) {
            for (Element hit : container.getElementsByClass(cssClass)) {
                System.out.println(hit);
            }
        }
    }

    /**
     * Runs a title search on CiteSeerX and returns the result page.
     *
     * <p>The query string comes from
     * {@link #buildCiteSeerSearchTitle(String)}; the request times out after
     * 30 seconds.
     *
     * @param title the title to search for
     * @return the HTML of the result page, or an empty string when the page
     * could not be fetched (the failure is logged)
     */
    public String searchCiteSeer(String title) {
        final String requestUrl = "http://citeseerx.ist.psu.edu/search?" + buildCiteSeerSearchTitle(title);
        try {
            return Jsoup.connect(requestUrl).timeout(30000).get().toString();
        } catch (IOException fetchFailure) {
            Logger.getLogger(WebSearch.class.getName()).log(Level.SEVERE, null, fetchFailure);
            return "";
        }
    }

    /**
     * Builds the query string of a CiteSeerX title search.
     *
     * <p>The title is split on single spaces and its words are joined with
     * {@code +} inside a {@code title:( )} clause, followed by the
     * {@code submit=Search&sort=cite&t=doc} options. The words themselves are
     * not URL-encoded.
     *
     * @param title the title to search for; must not be {@code null}
     * @return the query string, starting with {@code q=}
     */
    public static String buildCiteSeerSearchTitle(String title) {
        //q = "title%3A%28{1}%29+author%3%28{0}%29&submit=Search&sort=cite&t=doc"
        String[] titleTokens = title.split(" ");
        StringBuilder searchTitle = new StringBuilder("q=title%3A%28");
        // Walk every token instead of special-casing the last one: a title made
        // only of spaces splits into zero tokens and must not fail.
        for (int i = 0; i < titleTokens.length; i++) {
            if (i > 0) {
                searchTitle.append('+');
            }
            searchTitle.append(titleTokens[i]);
        }
        searchTitle.append("%29&submit=Search&sort=cite&t=doc");
        return searchTitle.toString();
    }

    /**
     * Builds the query string of a Google Scholar search.
     *
     * <p>The title is split on single spaces and its words are joined with
     * {@code +}. The words themselves are not URL-encoded.
     *
     * <p>NOTE(review): the result ends with {@code %22%29} (a closing quote
     * and parenthesis) although nothing opens them; this looks inherited from
     * the Scopus query builder. It is kept unchanged - confirm whether it is
     * wanted.
     *
     * @param title the title to search for; must not be {@code null}
     * @return the query string, starting with {@code q=}
     */
    public static String buildGoogleSearchTitle(String title) {
        String[] titleTokens = title.split(" ");
        StringBuilder searchTitle = new StringBuilder("q=");
        // Walk every token instead of special-casing the last one: a title made
        // only of spaces splits into zero tokens and must not fail.
        for (int i = 0; i < titleTokens.length; i++) {
            if (i > 0) {
                searchTitle.append('+');
            }
            searchTitle.append(titleTokens[i]);
        }
        searchTitle.append("%22%29");
        return searchTitle.toString();
    }

    /**
     * Builds a Scopus {@code TITLE("...")} query in URL-encoded form.
     *
     * <p>The title is split on single spaces and its words are joined with
     * {@code %20} between {@code TITLE%28%22} and {@code %22%29}. The words
     * themselves are not URL-encoded.
     *
     * @param title the title to search for; must not be {@code null}
     * @return the value for the {@code query} parameter of the Scopus search
     */
    public static String buildScopusSearchTitle(String title) {
        String[] titleTokens = title.split(" ");
        StringBuilder searchTitle = new StringBuilder("TITLE%28%22");
        // Walk every token instead of special-casing the last one: a title made
        // only of spaces splits into zero tokens and must not fail.
        for (int i = 0; i < titleTokens.length; i++) {
            if (i > 0) {
                searchTitle.append("%20");
            }
            searchTitle.append(titleTokens[i]);
        }
        searchTitle.append("%22%29");
        return searchTitle.toString();
    }

    /**
     * Builds a Scopus exact-match {@code TITLE{...}} query in URL-encoded
     * form and prints it to standard output.
     *
     * <p>The title is split on single spaces; every word is trimmed of
     * surrounding whitespace and the words are joined with {@code %20}
     * between {@code TITLE%7B} and {@code %7D}. The words themselves are not
     * URL-encoded.
     *
     * @param title the title to search for; must not be {@code null}
     * @return the value for the {@code query} parameter of the Scopus search
     */
    public static String buildScopusExactSearchTitle(String title) {
        String[] titleTokens = title.split(" ");
        StringBuilder searchTitle = new StringBuilder("TITLE%7B");
        // Every token is handled the same way: the last one is trimmed like
        // the others, and a title that splits into zero tokens does not fail.
        for (int i = 0; i < titleTokens.length; i++) {
            if (i > 0) {
                searchTitle.append("%20");
            }
            searchTitle.append(titleTokens[i].trim());
        }
        searchTitle.append("%7D");
        System.out.println("The search title is :" + searchTitle);
        return searchTitle.toString();
    }

    /**
     * Builds the search term of a Nature search by joining the words of the
     * title with {@code +}.
     *
     * <p>The title is split on single spaces; the words themselves are not
     * URL-encoded.
     *
     * @param title the title to search for; must not be {@code null}
     * @return all words of the title joined with {@code +}
     */
    public static String buildNatureSearchTitle(String title) {
        String[] titleTokens = title.split(" ");
        StringBuilder searchTitle = new StringBuilder();
        // Include the last word too; it used to be dropped, leaving a dangling
        // '+' (and an empty result for one-word titles).
        for (int i = 0; i < titleTokens.length; i++) {
            if (i > 0) {
                searchTitle.append('+');
            }
            searchTitle.append(titleTokens[i]);
        }
        return searchTitle.toString();
    }

    /**
     * Reads the locally saved Google Scholar result page.
     *
     * <p>The page is read from the fixed path
     * {@code C:\DRINVENTOR_PROJECT\corpus\test.htm}; each line is terminated
     * with the platform line separator.
     *
     * @return the content of the file, or an empty string when it cannot be
     * read (the failure is logged)
     */
    public String getGS() {
        String fullText = "";
        // try-with-resources closes the reader on every path.
        try (BufferedReader br = new BufferedReader(
                new FileReader("C:\\DRINVENTOR_PROJECT\\corpus\\test.htm"))) {
            StringBuilder sb = new StringBuilder();
            String sCurrentLine;
            while ((sCurrentLine = br.readLine()) != null) {
                sb.append(sCurrentLine);
                sb.append(System.lineSeparator());
            }
            // Assigned only after a complete read, so a failure mid-file yields "".
            fullText = sb.toString();
        } catch (IOException e) {
            // Report the failure instead of silently dropping it.
            Logger.getLogger(WebSearch.class.getName()).log(Level.SEVERE, null, e);
        }
        return fullText;
    }

    /**
     * Extracts the text of a field from a given xml string.
     *
     * <p>When the tag occurs several times the text of its last occurrence is
     * returned. Every comma in the text is replaced by a semicolon. Documents
     * containing a DOCTYPE declaration are rejected, so entities declared in
     * the document are never expanded.
     *
     * @param xmlTag the name of the tag used in the xml file (such as title,
     * author etc), including its prefix if it has one
     * @param xmlString the xml string
     * @return the extracted value of the tag; an empty string when the tag is
     * absent or the xml cannot be parsed (parse failures are logged)
     */
    public static String extractField(String xmlTag, String xmlString) {
        String extractedField = "";
        try {
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            // The xml comes from remote services: refuse DOCTYPE declarations
            // so that external entities cannot be resolved (XXE).
            dbFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            org.w3c.dom.Document doc = dBuilder.parse(new InputSource(new StringReader(xmlString)));
            doc.getDocumentElement().normalize();

            NodeList nList = doc.getElementsByTagName(xmlTag);
            // Later occurrences override earlier ones, so only the last one matters.
            int last = nList.getLength() - 1;
            if (last >= 0) {
                extractedField = nList.item(last).getTextContent();
            }
        } catch (ParserConfigurationException | SAXException | IOException ex) {
            Logger.getLogger(WebSearch.class.getName()).log(Level.SEVERE, null, ex);
        }
        return extractedField.replace(",", ";");
    }

    /**
     * Extracts multiple fields from the xml string and assigns the values to
     * ROS instances, one per {@code entry} element of the document.
     *
     * <p>For every direct child element of an entry the following tags are
     * copied into the ROS: {@code prism:doi}, {@code dc:title},
     * {@code prism:publicationName}, {@code prism:issn}, {@code affiliation}
     * (the text of its first {@code affilname}),
     * {@code prism:coverDisplayDate} (as a year), {@code citedby-count},
     * {@code prism:aggregationType} and {@code prism:volume}. Other tags are
     * ignored. Documents containing a DOCTYPE declaration are rejected.
     *
     * @param xmlString the xml String
     * @return an array of ROS, empty when the document has no entry;
     * {@code null} when the xml cannot be parsed (the failure is logged)
     * @throws NumberFormatException if a {@code citedby-count} is not an
     * integer
     */
    public static ROS[] extractMultiField(String xmlString) {
        ROS[] ros = null;
        try {
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            // The xml comes from remote services: refuse DOCTYPE declarations
            // so that external entities cannot be resolved (XXE).
            dbFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            org.w3c.dom.Document doc = dBuilder.parse(new InputSource(new StringReader(xmlString)));
            doc.getDocumentElement().normalize();

            NodeList nList = doc.getElementsByTagName("entry");
            ros = new ROS[nList.getLength()];
            for (int temp = 0; temp < nList.getLength(); temp++) {
                ros[temp] = new ROS();
                NodeList childNodes = nList.item(temp).getChildNodes();
                for (int i = 0; i < childNodes.getLength(); i++) {
                    Node childNode = childNodes.item(i);
                    if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                        continue;
                    }
                    org.w3c.dom.Element childElement = (org.w3c.dom.Element) childNode;
                    String value = childElement.getTextContent();

                    // A String switch compares with equals(); the former '=='
                    // chain only worked if the parser happened to intern names.
                    switch (childElement.getNodeName()) {
                        case "prism:doi":
                            ros[temp].setDoi(value.trim());
                            break;
                        case "dc:title":
                            ros[temp].setArticleTitle(value.trim());
                            break;
                        case "prism:publicationName":
                            ros[temp].setPublicationName(value.trim());
                            break;
                        case "prism:issn":
                            ros[temp].setIssn(value.trim());
                            break;
                        case "affiliation":
                            // An affiliation without a name is skipped instead of failing.
                            Node affilName = childElement.getElementsByTagName("affilname").item(0);
                            if (affilName != null) {
                                ros[temp].setAffiliation1(affilName.getTextContent());
                            }
                            break;
                        case "prism:coverDisplayDate":
                            ros[temp].setYear(extractYearFromYearString(value));
                            break;
                        case "citedby-count":
                            ros[temp].setCitedByCount(Integer.parseInt(value.trim()));
                            break;
                        case "prism:aggregationType":
                            ros[temp].setPublicationType(value.trim());
                            break;
                        case "prism:volume":
                            ros[temp].setVolume(value.trim());
                            break;
                        default:
                            break;
                    }
                }
            }
        } catch (ParserConfigurationException | SAXException | IOException ex) {
            Logger.getLogger(WebSearch.class.getName()).log(Level.SEVERE, null, ex);
        }

        return ros;
    }

    /**
     * Selects, among several versions of the same publication, the entry that
     * best represents it and gives it the summed citation count.
     *
     * <p>Only entries whose article title equals {@code title}, ignoring
     * case, are considered. The last matching entry whose publication type is
     * "journal" is returned if there is one, otherwise the last matching
     * entry. The citation counts of all matching entries are summed and
     * stored in the returned entry (for example v1 with 20 cites and v2 with
     * 17 cites give 37 cites). When a non-journal entry is returned and some
     * matching entry carries both an ISSN and a volume, the publication name
     * of the last such entry is copied into the returned entry.
     *
     * @param allRos an array representing the different versions of the same
     * ROS appearing in conference, journal, book series etc; may be
     * {@code null}
     * @param title the title of the research object skeleton
     * @return the selected entry of {@code allRos} (modified in place), or a
     * new ROS carrying only the title when no entry matches
     */
    public static ROS extractBestROS(ROS[] allRos, String title) {
        String properPubName = "";
        int targetRos = -1;
        int jorTargetRos = -1;
        int totalCitation = 0;
        int rosTotal = (allRos == null) ? 0 : allRos.length;
        for (int rosCount = 0; rosCount < rosTotal; rosCount++) {
            ROS candidate = allRos[rosCount];
            String candidateTitle = candidate.getArticleTitle();
            if (candidateTitle == null || !candidateTitle.equalsIgnoreCase(title)) {
                continue;
            }
            targetRos = rosCount;
            // Constant-first comparison tolerates an unset publication type.
            if ("journal".equalsIgnoreCase(candidate.getPublicationType())) {
                jorTargetRos = rosCount;
            }
            totalCitation += candidate.getCitedByCount();

            // An entry with both an ISSN and a volume potentially carries the
            // official publication name.
            String issn = candidate.getIssn();
            String volume = candidate.getVolume();
            if (issn != null && !issn.isEmpty() && volume != null && !volume.isEmpty()) {
                properPubName = candidate.getPublicationName();
            }
        }

        if (jorTargetRos != -1) {
            allRos[jorTargetRos].setCitedByCount(totalCitation);
            return allRos[jorTargetRos];
        }
        if (targetRos != -1) {
            // Compare by content: the former reference check (!= "") also let
            // a null or empty name overwrite the existing one.
            if (properPubName != null && !properPubName.isEmpty()) {
                allRos[targetRos].setPublicationName(properPubName);
            }
            allRos[targetRos].setCitedByCount(totalCitation);
            return allRos[targetRos];
        }
        ROS emptyRos = new ROS();
        emptyRos.setArticleTitle(title);
        return emptyRos;
    }

    /**
     * Extracts the meta data returned from the Scopus search method, using
     * the multi-field extraction method.
     *
     * <p>The number of results is read from {@code opensearch:totalResults}.
     * With no result a message is printed and an empty ROS is returned. With
     * a single parsed entry that entry is returned as is; with several
     * entries the choice is delegated to
     * {@link #extractBestROS(ROS[], String)}.
     *
     * @param xmlString the xml string returned from the Scopus search
     * @param title the title of the ros
     * @return the extracted ROS, or an empty ROS when there is no result or
     * no entry could be parsed
     * @throws Exception in particular {@link NumberFormatException} when the
     * xml holds no numeric {@code opensearch:totalResults} (for example when
     * the search returned nothing)
     */
    public static ROS extractScopusMetadata(String xmlString, String title) throws Exception {
        ROS refinedROS = new ROS();
        int totalResults = Integer.parseInt(extractField("opensearch:totalResults", xmlString).trim());
        if (totalResults == 0) {
            System.out.println("The search in Scopus returned no result. Trying other sources");
        } else if (totalResults > 0) {
            ROS[] ross = extractMultiField(xmlString);
            // extractMultiField yields null on a parse failure and an empty
            // array when the document has no entry; keep the empty ROS then.
            if (ross != null && ross.length > 1) {
                refinedROS = extractBestROS(ross, title);
            } else if (ross != null && ross.length == 1) {
                refinedROS = ross[0];
            }
        }
        return refinedROS;
    }

    /**
     * Placeholder for the extraction of Nature search results: prints the
     * received xml and returns an empty ROS.
     *
     * @param xmlString the xml string returned by the Nature search
     * @param title the title of the ros (not used)
     * @return a new, empty ROS
     * @throws Exception declared for symmetry with the other extractors
     */
    public static ROS extractNatureMetadata(String xmlString, String title) throws Exception {
        System.out.println("xml String from nature" + xmlString);
        return new ROS();
    }

    /**
     * Extracts the meta data of a publication from a CiteSeerX result page.
     *
     * <p>The elements with class {@code result} are visited in order. For
     * each one the {@code doc_details} text is printed next to the title and
     * compared with it, ignoring case; processing stops at the first result
     * that does not match. For a matching result the year ({@code pubyear}),
     * the citation count (third word of the {@code citation} text, -1 when
     * there are fewer words) and the venue ({@code pubvenue}, lower-cased)
     * are stored; the publication type is derived from the venue name.
     *
     * @param xmlString the HTML of the CiteSeerX result page
     * @param title the title of the ros
     * @return the ROS filled from the matching results; an empty ROS when the
     * first result does not match
     * @throws Exception in particular {@link NumberFormatException} when the
     * year or the citation count is not numeric
     */
    public static ROS extractCiteSeerMetadata(String xmlString, String title) throws Exception {
        Document page = Jsoup.parse(xmlString);
        ROS ros = new ROS();

        for (Element hit : page.getElementsByClass("result")) {
            String hitTitle = hit.getElementsByClass("doc_details").text();
            System.out.println(hitTitle + "\t" + title);
            if (!hitTitle.equalsIgnoreCase(title)) {
                // NOTE(review): a non-matching result ends the scan, so later
                // results are never examined - confirm this is intended.
                break;
            }
            ros.setArticleTitle(title);

            String yearText = hit.getElementsByClass("pubyear").text().replace(", ", "");
            int year = 0;
            if (yearText.length() > 0) {
                year = Integer.parseInt(yearText);
            }
            System.out.println("year:" + yearText);
            ros.setYear(year);

            String[] citationWords = hit.getElementsByClass("citation").text().split(" ");
            int citedByCount = -1;
            if (citationWords.length > 2) {
                citedByCount = Integer.parseInt(citationWords[2]);
            }
            ros.setCitedByCount(citedByCount);

            String venue = hit.getElementsByClass("pubvenue").text().replace("- ", "").toLowerCase();
            String publicationType;
            if (venue.contains("journal")) {
                publicationType = "Journal";
            } else if (venue.contains("conference") || venue.contains("conf.") || venue.contains("proc.")) {
                publicationType = "Conference";
            } else {
                publicationType = "";
            }
            ros.setPublicationType(publicationType);
            ros.setPublicationName(venue);
        }

        return ros;
    }

    /**
     * Extracts the year from a date string such as "1 January 2014".
     *
     * <p>The string is split on single spaces. The last token containing a
     * four-digit year starting with 19 or 20 wins. If there is none, the last
     * token made of exactly two digits is taken as an abbreviated year:
     * values below 20 are placed in the 2000s, all others in the 1900s
     * (03 gives 2003, 99 gives 1999).
     *
     * @param yearString the date text; must not be {@code null}
     * @return the four-digit year, or 0 when the string contains no year
     */
    private static int extractYearFromYearString(String yearString) {
        // Compiled once per call rather than once per token.
        Pattern fourDigitYear = Pattern.compile("(19|20)\\d\\d");
        int fullYear = 0;
        int shortYear = -1;
        for (String token : yearString.split(" ")) {
            Matcher m = fourDigitYear.matcher(token);
            if (m.find()) {
                // Parse only the matched digits so tokens such as "2003," do not fail.
                fullYear = Integer.parseInt(m.group());
            } else if (token.matches("\\d\\d")) {
                shortYear = Integer.parseInt(token);
            }
        }
        if (fullYear != 0) {
            return fullYear;
        }
        if (shortYear < 0) {
            // No year at all: report 0 rather than inventing the year 2000.
            return 0;
        }
        return (shortYear < 20) ? 2000 + shortYear : 1900 + shortYear;
    }

    /**
     * Uses a regular expression to extract the year part from a DOI.
     *
     * <p>The DOI is split on {@code /}. In the URL form
     * ({@code http://dx.doi.org/<prefix>/<suffix>}) the fifth segment is
     * examined; for shorter inputs such as a bare DOI the last segment is
     * used. That segment is split on dots and the last part containing a
     * four-digit year starting with 19 or 20 provides the result.
     *
     * @param doi the doi extracted from the xml file
     * @return the four-digit year, or an empty string when the doi is
     * {@code null} or contains no year
     */
    private static String extractYearFromDOI(String doi) {
        if (doi == null) {
            return "";
        }
        String[] parts = doi.split("/");
        if (parts.length == 0) {
            return "";
        }
        // Index 4 only exists for the URL form; fall back to the last segment
        // instead of failing on a bare DOI.
        String yearString = (parts.length > 4) ? parts[4] : parts[parts.length - 1];

        Pattern fourDigitYear = Pattern.compile("(19|20)\\d\\d");
        String year = "";
        for (String part : yearString.split("\\.")) {
            Matcher m = fourDigitYear.matcher(part);
            if (m.find()) {
                // Keep the matched digits only, not the whole part.
                year = m.group();
            }
        }
        return year;
    }

    /**
     * Writes the meta data of a research object to an xml file.
     *
     * <p>The document has a {@code ROS} root with the children {@code DOI},
     * {@code Title}, {@code Authors}, {@code Institution}, {@code Year},
     * {@code Publication} and {@code Conference}; only DOI, Title, Authors
     * and Year receive a value. The output file name is derived from
     * {@code xmlFile} by turning a trailing {@code .xml} into {@code MD.xml};
     * when there is no such extension {@code MD.xml} is appended, so the
     * input file is never overwritten. Failures are logged.
     *
     * @param doi the DOI of the research object
     * @param ti the title
     * @param au the authors
     * @param pubYear the publication year
     * @param xmlFile the path of the xml file the meta data belongs to
     */
    public static void createROS(String doi, String ti, String au, String pubYear, String xmlFile) {
        try {
            // A builder that cannot be created now ends the method through the
            // catch below instead of causing a NullPointerException later.
            DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            org.w3c.dom.Document rosDoc = docBuilder.newDocument();
            org.w3c.dom.Element rootElement = rosDoc.createElement("ROS");
            rosDoc.appendChild(rootElement);
            org.w3c.dom.Element DOI = rosDoc.createElement("DOI");
            org.w3c.dom.Element title = rosDoc.createElement("Title");
            org.w3c.dom.Element authors = rosDoc.createElement("Authors");
            org.w3c.dom.Element institution = rosDoc.createElement("Institution");
            org.w3c.dom.Element year = rosDoc.createElement("Year");
            org.w3c.dom.Element publication = rosDoc.createElement("Publication");
            org.w3c.dom.Element conference = rosDoc.createElement("Conference");

            rootElement.appendChild(DOI);
            rootElement.appendChild(title);
            rootElement.appendChild(authors);
            rootElement.appendChild(institution);
            rootElement.appendChild(year);
            rootElement.appendChild(publication);
            rootElement.appendChild(conference);

            DOI.appendChild(rosDoc.createTextNode(doi));
            title.appendChild(rosDoc.createTextNode(ti));
            authors.appendChild(rosDoc.createTextNode(au));
            year.appendChild(rosDoc.createTextNode(pubYear));

            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            DOMSource source = new DOMSource(rosDoc);

            // Only a trailing ".xml" is the extension; replacing every
            // occurrence would also rewrite directory names.
            String targetFile;
            if (xmlFile.endsWith(".xml")) {
                targetFile = xmlFile.substring(0, xmlFile.length() - ".xml".length()) + "MD.xml";
            } else {
                targetFile = xmlFile + "MD.xml";
            }
            System.out.println("Writing to:\n" + targetFile);
            StreamResult result = new StreamResult(new File(targetFile));
            transformer.transform(source, result);

        } catch (ParserConfigurationException | TransformerException ex) {
            Logger.getLogger(WebSearch.class.getName()).log(Level.SEVERE, null, ex);
        }

    }
}