Java tutorial
/* * Copyright 2016 Florian Pollak * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.epop.dataprovider.pubmed; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPathExpressionException; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.Source; import org.apache.commons.lang.StringEscapeUtils; import org.apache.http.client.ClientProtocolException; import org.epop.dataprovider.DataProvider; import org.epop.dataprovider.HTMLPage; import org.epop.dataprovider.XMLPage; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import com.github.bfour.fpliteraturecollector.domain.Author; import com.github.bfour.fpliteraturecollector.domain.Link; import com.github.bfour.fpliteraturecollector.domain.Literature; import com.github.bfour.fpliteraturecollector.domain.Literature.LiteratureType; import com.github.bfour.fpliteraturecollector.domain.builders.AuthorBuilder; import com.github.bfour.fpliteraturecollector.domain.builders.LiteratureBuilder; public class PubMedSearch extends DataProvider { private static enum PubMedDisplayUnits { FIVE(5), TEN(10), TWENTY(20), FIFTY(50), ONE_HUNDRED(100), TWO_HUNDRED(200); private int maxDisplayedResults; PubMedDisplayUnits(int maxDisplayedResults) { this.maxDisplayedResults = maxDisplayedResults; } public int getMaxDisplayedResults() { return maxDisplayedResults; } public static PubMedDisplayUnits getDispmaxParamByDesiredResults(int desiredResultNum) { for (PubMedDisplayUnits unit : values()) { if (unit.getMaxDisplayedResults() <= desiredResultNum) return unit; } return null; } } Logger logger = Logger.getLogger(PubMedSearch.class.getCanonicalName()); private static final String PUBMED_SEARCH = "http://www.ncbi.nlm.nih.gov/pubmed/"; private static final long DELAY = 18611; private static final int SEARCH_STEP = 10; private static final String ID_PATTERN_STRING = ".*/pubmed/(\\d+).*"; private static Pattern ID_PATTERN; @Override public String getDescription() { return "PubMed"; } @Override protected Reader getHTMLDoc(String htmlParams, int pageTurnLimit, boolean initialWait) { try { if (initialWait) Thread.sleep(DELAY); // TODO (low) implemented cleaner solution (desired num. of results // in AtomicRequest instead of page turns) PubMedDisplayUnits desiredResultUnit = PubMedDisplayUnits .getDispmaxParamByDesiredResults(pageTurnLimit * SEARCH_STEP); String uriString = PUBMED_SEARCH + "?" + htmlParams; if (desiredResultUnit != null) uriString += "&dispmax=" + desiredResultUnit; URI uri = new URI(uriString); HTMLPage page = new HTMLPage(uri); return new StringReader(page.getRawCode()); } catch (URISyntaxException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ParserConfigurationException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; // // // return the result as string // return new StringReader(responseBody); } @Override protected List<Literature> parsePage(Reader page) { List<Literature> result = new ArrayList<Literature>(); try { Source source = new Source(page); for (Element element : source.getAllElements("class", "rslt", false)) { Literature lit = extractPaper(element); if (lit != null) result.add(lit); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return result; } private Literature extractPaper(Element element) { if (ID_PATTERN == null) ID_PATTERN = Pattern.compile(ID_PATTERN_STRING); LiteratureBuilder litBuilder = new LiteratureBuilder(); Element titleElem = element.getFirstElement("class", "title", false); if (titleElem != null) { Matcher matcher = ID_PATTERN.matcher(titleElem.getAllElements("a").get(0).getAttributeValue("href")); if (matcher.find()) { String id = matcher.group(1); // String entryLink = PUBMED_SEARCH + id; litBuilder.setPubmedID(id); try { articleToLiterature(id, litBuilder); } catch (URISyntaxException | IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ParserConfigurationException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (SAXException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else { // TODO error handling } } return litBuilder.getObject(); } private void articleToLiterature(String articleID, LiteratureBuilder builder) throws ClientProtocolException, IOException, ParserConfigurationException, URISyntaxException, XPathExpressionException, SAXException { String pubMedURL = "http://www.ncbi.nlm.nih.gov/pubmed/" + articleID; HTMLPage htmlPage = new HTMLPage(pubMedURL); HTMLPage page = new HTMLPage("http://www.ncbi.nlm.nih.gov/pubmed/" + articleID + "?report=xml&format=text"); String xmlCode = StringEscapeUtils.unescapeHtml(page.getStringByXPath("html/body/pre/text()")); XMLPage xmlPage = new XMLPage(xmlCode); // title try { builder.setTitle(xmlPage.getStringByXPath("//ArticleTitle/text()")); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } // abstract text try { NodeList nodes = xmlPage.getNodeSetByXPath("//Abstract"); int i = 0; Node node; StringBuilder stringBuilder = new StringBuilder("<html>"); while ((node = nodes.item(i)) != null) { if (node.getAttributes().getNamedItem("Label") != null) { stringBuilder.append("<b>"); stringBuilder.append(node.getAttributes().getNamedItem("Label").getTextContent()); stringBuilder.append("</b><br/>"); } stringBuilder.append(node.getTextContent().trim().replaceAll("\t", "").replaceAll("\\s{2,}", " ")); stringBuilder.append("<br/>"); i++; } builder.setAbstractText(stringBuilder.toString() + "</html>"); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } // authors try { Set<Author> authors = new HashSet<>(); int i = 0; while (true) { i++; String lastName = xmlPage.getStringByXPath("//AuthorList/Author[" + i + "]/ForeName"); String firstName = xmlPage.getStringByXPath("//AuthorList/Author[" + i + "]/LastName"); if (lastName.isEmpty() && firstName.isEmpty()) break; authors.add(new AuthorBuilder().setFirstName(firstName).setLastName(lastName).getObject()); } builder.setAuthors(authors); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } // DOI try { String doi = xmlPage.getStringByXPath("//ArticleId[@IdType='doi']/text()"); if (doi != null && !doi.isEmpty()) builder.setDOI(doi); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } // year try { String year = xmlPage.getStringByXPath("//DateRevised/Year/text()"); if (year != null && !year.isEmpty()) builder.setYear(Integer.parseInt(year)); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } // publication context try { String journalTitle = xmlPage.getStringByXPath("//Article/Journal/Title/text()"); if (journalTitle != null && !journalTitle.isEmpty()) { builder.setPublicationContext(journalTitle); builder.setType(LiteratureType.JOURNAL_PAPER); } } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } // URLs try { NodeList nodes = htmlPage .getNodeSetByXPath(".//*[@id='maincontent']//div[@class='linkoutlist']/ul[1]/li"); int i = 0; Node node; Set<Link> webSiteLinks = new HashSet<>(); Set<Link> fullTextLinkSet = new HashSet<Link>(); // add pubmed page webSiteLinks.add(new Link("PubMed", pubMedURL)); while ((node = nodes.item(i)) != null) { if (node.getFirstChild().getAttributes().getNamedItem("href") == null) { i++; continue; } String linkText = node.getFirstChild().getTextContent(); String uri = node.getFirstChild().getAttributes().getNamedItem("href").getTextContent(); webSiteLinks.add(new Link(linkText, uri)); if (linkText.equals("PubMed Central")) { HTMLPage pmcPage = new HTMLPage(uri); Node pdfLinkNode = pmcPage .getNodeByXPath(".//*[@id='rightcolumn']//div[@class='format-menu']/ul/li[4]/a/@href"); String pdfLink = pdfLinkNode.getTextContent(); fullTextLinkSet.add(new Link(linkText, "http://www.ncbi.nlm.nih.gov" + pdfLink)); } i++; } builder.setWebsiteURLs(webSiteLinks); builder.setFulltextURLs(fullTextLinkSet); } catch (XPathExpressionException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }