Java tutorial
/***************************************************************************
 * Copyright (C) 2009 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/
package org.ala.harvester;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.ala.documentmapper.DocumentMapper;
import org.ala.documentmapper.MappingUtils;
import org.ala.repository.ParsedDocument;
import org.ala.repository.Predicates;
import org.ala.repository.Repository;
import org.ala.repository.Triple;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Scope;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import org.w3c.dom.Document;

/**
 * A Harvester class for Mosquitoes of Australia.
 *
 * <p>The harvester downloads a single index page from the configured
 * endpoint, collects every in-page anchor link found on it, and for each
 * anchor extracts the matching species paragraph (scientific name,
 * description, distribution and photo link) by regular-expression scraping.
 * Each species is stored in the {@link Repository} as a
 * {@link ParsedDocument}, followed by an image document when a photo link
 * was found.
 *
 * @author Tommy Wang
 */
@Component("MaHarvester")
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class MaHarvester implements Harvester {

    private static final int MA_INFOSOURCE_ID = 1066;

    /** Charset requested for HTTP content and used to decode/encode page bodies. */
    private static final String UTF_8 = "UTF-8";

    /** Matches {@code <a href="...">} links whose target consists only of lowercase letters and '#'. */
    private static final Pattern IDENTIFIER_PATTERN =
            Pattern.compile("(?:<a href=\")" + "([a-z#]{1,})" + "(?:\">)");

    /** Matches a parenthesised run of letters, spaces and commas; group 1 is the text inside. */
    private static final Pattern DISTRIBUTION_PATTERN =
            Pattern.compile("(?:\\()" + "([a-zA-Z \\,]{1,})" + "(?:\\))");

    /** Matches a {@code (<a href="mosqphotos/NAME.jpg">Photo} link; group 1 is the relative image path. */
    private static final Pattern IMAGE_PATTERN =
            Pattern.compile("(?:\\(<a href=\")" + "(mosqphotos/[a-zA-Z_]{1,}\\.jpg)" + "(?:\">Photo)");

    protected Logger logger = Logger.getLogger(MaHarvester.class);
    protected String endpoint;
    protected Repository repository;
    protected int timeGap = 0;
    protected String contentType = "text/xml";

    // NOTE(review): this prefix is empty, so stored photo URLs are the relative
    // "mosqphotos/..." paths. Presumably it should be the site's base URL - confirm.
    protected final String baseUrl = "";

    /**
     * Main method for testing this particular Harvester.
     *
     * @param args unused
     * @throws Exception if the Spring context cannot be loaded or the harvest fails
     */
    public static void main(String[] args) throws Exception {
        String[] locations = { "classpath*:spring.xml" };
        ApplicationContext context = new ClassPathXmlApplicationContext(locations);
        MaHarvester h = new MaHarvester();
        Repository r = (Repository) context.getBean("repository");
        h.setRepository(r);

        // Set the connection params.
        // NOTE(review): the endpoint value is empty here; supply the index page URL before running.
        Map<String, String> connectParams = new HashMap<String, String>();
        connectParams.put("endpoint", "");
        h.setConnectionParams(connectParams);
        h.start(MA_INFOSOURCE_ID);
    }

    /**
     * Reads the {@code "endpoint"} entry of the supplied map as the index page URL.
     *
     * @param connectionParams connection settings; only the key {@code "endpoint"} is read
     * @see org.ala.harvester.Harvester#setConnectionParams(java.util.Map)
     */
    @Override
    public void setConnectionParams(Map<String, String> connectionParams) {
        this.endpoint = connectionParams.get("endpoint");
    }

    /**
     * Records the delay to apply before harvesting, then starts the harvest.
     *
     * @param infosourceId id passed through to the repository for every stored document
     * @param timeGap      milliseconds to sleep before the index page is fetched
     * @throws Exception if fetching, parsing or storing fails
     */
    @Override
    public void start(int infosourceId, int timeGap) throws Exception {
        this.timeGap = timeGap;
        start(infosourceId);
    }

    /**
     * Fetches the index page, collects every link matched by
     * {@link #IDENTIFIER_PATTERN} and processes each one against the same page.
     *
     * @param infosourceId id passed through to the repository for every stored document
     * @throws IllegalStateException if no endpoint has been configured
     * @throws Exception             if fetching, parsing or storing fails
     */
    @Override
    public void start(int infosourceId) throws Exception {
        if (this.endpoint == null) {
            throw new IllegalStateException(
                    "No endpoint configured; call setConnectionParams() before start()");
        }
        // Thread.sleep rejects negative values, so only sleep for a positive gap.
        if (timeGap > 0) {
            Thread.sleep(timeGap);
        }

        String indexStr = getIndexPageStr();
        if (indexStr == null) {
            logger.warn("Index page `" + this.endpoint + "` returned no body; nothing to harvest");
            return;
        }

        // Collect all the anchor links found on the index page.
        List<String> identifiers = new ArrayList<String>();
        Matcher m = IDENTIFIER_PATTERN.matcher(indexStr);
        while (m.find()) {
            String generatedUrl = this.endpoint + m.group(1);
            logger.debug("URL:" + generatedUrl);
            identifiers.add(generatedUrl);
        }

        for (String identifier : identifiers) {
            processSingleImage(identifier, indexStr, infosourceId);
        }
        logger.info("Identifiers processed: " + identifiers.size());
    }

    /**
     * Extracts and stores the species entry that the given identifier points at.
     *
     * <p>The page is cut at the {@code <strong>The Mosquitoes</strong>} heading
     * and the remainder is split on {@code <p>}. Every paragraph that contains
     * {@code <a name="ANCHOR">} (and does not contain {@code </body>}) is parsed
     * and stored. Nothing happens when the heading does not occur exactly once,
     * or when the identifier carries no fragment.
     *
     * @param identifier   page URL followed by {@code #anchor}; also used as the document GUID
     * @param pageStr      HTML of the index page
     * @param infosourceId id passed through to the repository
     * @throws Exception if downloading the content or storing a document fails
     */
    public void processSingleImage(String identifier, String pageStr, int infosourceId)
            throws Exception {
        String anchor = extractAnchor(identifier);
        if (anchor == null) {
            // The link pattern also accepts targets without '#'; those name no species entry.
            logger.debug("Skipping identifier without anchor: " + identifier);
            return;
        }

        String[] bodyContentStrs = pageStr.split("<strong>The Mosquitoes</strong>");
        if (bodyContentStrs.length != 2) {
            return;
        }

        String[] speciesStrs = bodyContentStrs[1].split("<p>");
        if (speciesStrs.length <= 1) {
            return;
        }

        String anchorTag = "<a name=\"" + anchor + "\">";
        for (String speciesStr : speciesStrs) {
            if (!speciesStr.contains("</body>") && speciesStr.contains(anchorTag)) {
                processSpeciesParagraph(identifier, speciesStr, infosourceId);
            }
        }
    }

    /**
     * Returns the text after the first '#' of the identifier (up to the next
     * '#', if any), or {@code null} when there is no such non-trailing part.
     */
    private static String extractAnchor(String identifier) {
        String[] parts = identifier.split("#");
        return parts.length > 1 ? parts[1] : null;
    }

    /**
     * Parses one species paragraph and stores the resulting documents.
     *
     * <p>The paragraph is split on simple tags; the first non-blank fragment
     * without a {@code <} becomes the scientific name. Fragments starting with
     * ':' supply the distribution, photo link and description.
     */
    private void processSpeciesParagraph(String identifier, String speciesStr, int infosourceId)
            throws Exception {
        String[] components = speciesStr.split("<[/a-z]{1,}>");
        String sciName = null;
        String description = null;
        String photoUrl = null;
        String distribution = null;

        if (components.length > 1) {
            for (int i = 0; i < components.length; i++) {
                String raw = components[i];

                // Skip blank fragments so whitespace between tags is never taken as the name.
                if (sciName == null && raw.trim().length() > 0 && !raw.contains("<")) {
                    sciName = raw.trim();
                }

                String text = raw.replaceAll("[\\s]{1,}", " ");
                logger.debug(text);

                if (sciName == null) {
                    continue;
                }

                if (text.startsWith(":")) {
                    Matcher matcher = DISTRIBUTION_PATTERN.matcher(text);
                    if (matcher.find()) {
                        distribution = matcher.group(1);
                        // Strip the parenthesised distribution so it does not end up in the description.
                        text = matcher.replaceFirst("");
                    }

                    matcher = IMAGE_PATTERN.matcher(text);
                    if (matcher.find()) {
                        photoUrl = baseUrl + matcher.group(1);
                        text = matcher.replaceFirst("");
                    }

                    // NOTE(review): the second needle is reconstructed as "see "; the
                    // original literal could not be recovered - confirm the intended text.
                    if (!text.contains("<")
                            || !(text.contains("similar to") || text.contains("see "))) {
                        description = text.replaceFirst(":", "").trim();
                        int lastDot = description.lastIndexOf(".");
                        if (lastDot > 0) {
                            // Keep only the text before the final full stop.
                            description = description.substring(0, lastDot);
                        }
                    }
                }

                // Fallback: pick up a photo link from any other fragment that mentions a jpg.
                if (photoUrl == null && text.contains("jpg")) {
                    Matcher matcher = IMAGE_PATTERN.matcher(text);
                    if (matcher.find()) {
                        photoUrl = baseUrl + matcher.group(1);
                    }
                }
            }
        }

        if (sciName == null) {
            return;
        }
        storeSpecies(identifier, infosourceId, sciName, description, distribution, photoUrl);
    }

    /**
     * Builds the species document (and the image document when a photo URL is
     * present) and stores both in the repository.
     */
    private void storeSpecies(String identifier, int infosourceId, String sciName,
            String description, String distribution, String photoUrl) throws Exception {
        ParsedDocument pd = new ParsedDocument();
        ParsedDocument imageDoc = null;
        List<Triple<String, String, String>> triples = pd.getTriples();
        Map<String, String> dcs = pd.getDublinCore();
        String subject = MappingUtils.getSubject();

        pd.setGuid(identifier);
        // NOTE(review): this downloads the identifier URL once per species; every
        // identifier differs only by its '#fragment' - consider reusing the page.
        pd.setContent(getContent(identifier));
        pd.setContentType(contentType);

        dcs.put(Predicates.DC_TITLE.toString(), sciName);
        dcs.put(Predicates.DC_IDENTIFIER.toString(), identifier);
        dcs.put(Predicates.COUNTRY.toString(), "Australia");

        triples.add(new Triple<String, String, String>(
                subject, Predicates.SCIENTIFIC_NAME.toString(), sciName));
        if (description != null && !"".equals(description)) {
            triples.add(new Triple<String, String, String>(
                    subject, Predicates.DESCRIPTIVE_TEXT.toString(), description));
        }
        if (distribution != null && !"".equals(distribution)) {
            triples.add(new Triple<String, String, String>(
                    subject, Predicates.DISTRIBUTION_TEXT.toString(), distribution));
        }
        if (photoUrl != null && !"".equals(photoUrl)) {
            triples.add(new Triple<String, String, String>(
                    subject, Predicates.IMAGE_URL.toString(), photoUrl));
            imageDoc = MappingUtils.retrieveImageDocument(pd, photoUrl);
        }

        debugParsedDoc(pd);
        this.repository.storeDocument(infosourceId, pd);

        if (imageDoc != null) {
            debugParsedDoc(imageDoc);
            this.repository.storeDocument(infosourceId, imageDoc);
        }
    }

    /** Sets the content, element and URI charset parameters of the request to UTF-8. */
    private static void applyUtf8Charsets(GetMethod method) {
        method.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, UTF_8);
        method.getParams().setParameter(HttpMethodParams.HTTP_ELEMENT_CHARSET, UTF_8);
        method.getParams().setParameter(HttpMethodParams.HTTP_URI_CHARSET, UTF_8);
    }

    /**
     * Downloads the given URL and returns its body as UTF-8 encoded bytes.
     *
     * @param url address to GET
     * @return the response body; empty when the response carries no body
     * @throws Exception if the request fails or the status is not 200 OK
     */
    private byte[] getContent(String url) throws Exception {
        HttpClient client = new HttpClient();
        GetMethod method = new GetMethod(url);
        applyUtf8Charsets(method);

        try {
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK) {
                String errMsg = "HTTP GET to " + "`" + url + "`"
                        + " returned non HTTP OK code. Returned code " + statusCode
                        + " and message " + method.getStatusLine() + "\n";
                logger.error(errMsg);
                throw new Exception(errMsg);
            }
            return inputStream2String(method.getResponseBodyAsStream()).getBytes(UTF_8);
        } finally {
            // Single release point for both the success and the failure path.
            method.releaseConnection();
        }
    }

    /**
     * Reads the whole stream as UTF-8 text, keeping line terminators intact,
     * and closes it.
     *
     * @param is stream to drain; {@code null} yields an empty string
     * @return the decoded text
     * @throws IOException if reading fails
     */
    private String inputStream2String(InputStream is) throws IOException {
        if (is == null) {
            return "";
        }
        BufferedReader in = new BufferedReader(new InputStreamReader(is, UTF_8));
        try {
            StringBuilder buffer = new StringBuilder();
            char[] chunk = new char[4096];
            int read;
            // Copy raw characters rather than lines so that line breaks are not dropped.
            while ((read = in.read(chunk)) != -1) {
                buffer.append(chunk, 0, read);
            }
            return buffer.toString();
        } finally {
            in.close();
        }
    }

    /**
     * Downloads the index page from the configured endpoint.
     *
     * @return the response body as text
     * @throws Exception if the status is not 200 OK, or wrapping the
     *                   {@link IOException} raised by a transport failure
     */
    private String getIndexPageStr() throws Exception {
        String urlToSearch = this.endpoint;
        logger.debug("URL to search is: " + "`" + urlToSearch + "`" + "\n");

        HttpClient client = new HttpClient();
        GetMethod method = new GetMethod(urlToSearch);
        applyUtf8Charsets(method);

        try {
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK) {
                String errMsg = "HTTP GET to " + "`" + urlToSearch + "`"
                        + " returned non HTTP OK code. Returned code " + statusCode
                        + " and message " + method.getStatusLine() + "\n";
                throw new Exception(errMsg);
            }
            return method.getResponseBodyAsString();
        } catch (IOException httpErr) {
            // Only transport failures are wrapped; the status error above propagates unchanged.
            String errMsg = "HTTP GET to `" + urlToSearch + "` returned HTTP error.";
            throw new Exception(errMsg, httpErr);
        } finally {
            method.releaseConnection();
        }
    }

    /**
     * Prints the GUID, content type, Dublin Core entries and triples of the
     * document to standard output.
     *
     * @param parsedDoc document to dump
     */
    public void debugParsedDoc(ParsedDocument parsedDoc) {
        System.out.println("===============================================================================");
        System.out.println("GUID: " + parsedDoc.getGuid());
        System.out.println("Content-Type: " + parsedDoc.getContentType());

        Map<String, String> dublinCore = parsedDoc.getDublinCore();
        for (Map.Entry<String, String> entry : dublinCore.entrySet()) {
            System.out.println("DC: " + entry.getKey() + "\t" + entry.getValue());
        }

        List<Triple<String, String, String>> triples = parsedDoc.getTriples();
        for (Triple<String, String, String> triple : triples) {
            System.out.println(
                    "RDF: " + triple.getSubject() + "\t" + triple.getPredicate() + "\t" + triple.getObject());
        }
        System.out.println("===============================================================================");
    }

    /**
     * @see org.ala.harvester.Harvester#setRepository(org.ala.repository.Repository)
     */
    public void setRepository(Repository repository) {
        this.repository = repository;
    }

    /**
     * Intentionally a no-op: this harvester scrapes the page itself and does
     * not use a {@link DocumentMapper}.
     */
    @Override
    public void setDocumentMapper(DocumentMapper documentMapper) {
        // Nothing to do.
    }
}