Java tutorial: org.ala.hbase.RepoDataLoader (Atlas of Living Australia)
/***************************************************************************
 * Copyright (C) 2010 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/
package org.ala.hbase;

import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.inject.Inject;

import org.ala.client.util.RestfulClient;
import org.ala.dao.InfoSourceDAO;
import org.ala.dao.SolrUtils;
import org.ala.dao.TaxonConceptDao;
import org.ala.model.Document;
import org.ala.model.InfoSource;
import org.ala.model.Triple;
import org.ala.repository.Predicates;
import org.ala.util.FileType;
import org.ala.util.PartialIndex;
import org.ala.util.RepositoryFileUtils;
import org.ala.util.SpringUtils;
import org.ala.util.TurtleUtils;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.time.DateUtils;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.ApplicationContext;
import org.springframework.stereotype.Component;

/**
 * Data Loader that scans through a BIE repository, finds triples
 * and adds them to concepts held in the profiler.
 *
 * @author Dave Martin
 */
@Component
public class RepoDataLoader {

    protected static Logger logger = Logger.getLogger(RepoDataLoader.class);

    protected static String repositoryDir = "/data/bie";
    @Inject
    protected TaxonConceptDao taxonConceptDao;
    protected Map<Integer, InfoSource> infoSourceMap;
    protected HashMap<String, Integer> uidInfoSourceMap;
    protected List<String> guidList;
    protected String reindexUrl;
    @Inject
    protected InfoSourceDAO infoSourceDAO;
    @Inject
    protected RepositoryFileUtils repoFileUtils;
    @Inject
    protected SolrUtils solrUtils;
    private boolean statsOnly = false;
    private boolean reindex = false;
    private boolean gList = false;
    private FileOutputStream guidOut = null;
    private PartialIndex indexer;

    int totalFilesRead = 0;
    int totalPropertiesSynced = 0;

    protected static Pattern uidPattern = Pattern.compile("(?:[\"]*)?(?:[a-z_]*_uid:\")([a-z0-9]*)(?:[\"]*)?");
    /**
     * This takes a list of infosource ids...
     * <p/>
     * Usage: -stats or -reindex or -gList and list of infosourceId
     *
     * @param args
     */
    public static void main(String[] args) throws Exception {
        //RepoDataLoader loader = new RepoDataLoader();
        ApplicationContext context = SpringUtils.getContext();
        RepoDataLoader loader = (RepoDataLoader) context.getBean(RepoDataLoader.class);
        long start = System.currentTimeMillis();
        loader.loadInfoSources();
        String filePath = repositoryDir;

        if (args.length > 0) {
            if (args[0].equalsIgnoreCase("-stats")) {
                loader.statsOnly = true;
                args = (String[]) ArrayUtils.subarray(args, 1, args.length);
            }
            if (args[0].equalsIgnoreCase("-reindex")) {
                loader.reindex = true;
                loader.indexer = context.getBean(PartialIndex.class);
                args = (String[]) ArrayUtils.subarray(args, 1, args.length);
                logger.info("**** -reindex: " + loader.reindex);
                logger.debug("reindex url: " + loader.reindexUrl);
            }
            if (args[0].equalsIgnoreCase("-gList")) {
                loader.gList = true;
                args = (String[]) ArrayUtils.subarray(args, 1, args.length);
                logger.info("**** -gList: " + loader.gList);
            }
            if (args[0].equalsIgnoreCase("-biocache")) {
                Hashtable<String, String> hashTable = new Hashtable<String, String>();
                hashTable.put("accept", "application/json");
                ObjectMapper mapper = new ObjectMapper();
                mapper.getDeserializationConfig().set(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);
                RestfulClient restfulClient = new RestfulClient(0);
                String fq = "&fq=";
                if (args.length > 1) {
                    java.util.Date date = new java.util.Date();
                    if (args[1].equals("-lastWeek")) {
                        date = DateUtils.addWeeks(date, -1);
                    } else if (args[1].equals("-lastMonth")) {
                        date = DateUtils.addMonths(date, -1);
                    } else if (args[1].equals("-lastYear")) {
                        date = DateUtils.addYears(date, -1);
                    } else {
                        date = null;
                    }
                    if (date != null) {
                        SimpleDateFormat sfd = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
                        fq += "last_load_date:%5B" + sfd.format(date) + "%20TO%20*%5D";
                    }
                }
                Object[] resp = restfulClient
                        .restGet("http://biocache.ala.org.au/ws/occurrences/search?q=multimedia:Image" + fq
                                + "&facets=data_resource_uid&pageSize=0", hashTable);
                logger.info("The URL: " + "http://biocache.ala.org.au/ws/occurrences/search?q=multimedia:Image" + fq
                        + "&facets=data_resource_uid&pageSize=0");
                if ((Integer) resp[0] == HttpStatus.SC_OK) {
                    String content = resp[1].toString();
                    logger.debug(resp[1]);
                    if (content != null && content.length() > "[]".length()) {
                        Map map = mapper.readValue(content, Map.class);
                        try {
                            List<java.util.LinkedHashMap<String, String>> list =
                                    ((List<java.util.LinkedHashMap<String, String>>) ((java.util.LinkedHashMap) ((java.util.ArrayList) map
                                            .get("facetResults")).get(0)).get("fieldResult"));
                            Set<String> arg = new LinkedHashSet<String>();
                            for (int i = 0; i < list.size(); i++) {
                                java.util.LinkedHashMap<String, String> value = list.get(i);
                                String dataResource = getDataResource(value.get("fq"));
                                Object provider = (loader.getUidInfoSourceMap().get(dataResource));
                                if (provider != null) {
                                    arg.add(provider.toString());
                                }
                            }
                            logger.info("Set of biocache infosource ids to load: " + arg);
                            args = new String[] {};
                            args = arg.toArray(args);
                            //handle the situation where biocache-service reports no data resources
                            if (args.length < 1) {
                                logger.error("No biocache data resources found. Unable to load.");
                                System.exit(0);
                            }
                        } catch (Exception e) {
                            logger.error("ERROR: exit process....." + e);
                            e.printStackTrace();
                            System.exit(0);
                        }
                    }
                } else {
                    logger.warn("Unable to process url: ");
                }
            }
        }

        int filesRead = loader.load(filePath, args); //FIX ME - move to config
        long finish = System.currentTimeMillis();
        logger.info(filesRead + " files scanned/loaded in: " + ((finish - start) / 60000) + " minutes "
                + ((finish - start) / 1000) + " seconds.");
        System.exit(1);
    }

    public static String getDataResource(String fq) {
        Matcher m = uidPattern.matcher(fq);
        if (m.matches()) {
            return m.group(1);
        }
        return fq;
    }

    public int load(String filePath, String[] repoDirs) throws Exception {
        return load(filePath, repoDirs, true);
    }

    /**
     * Scan through the single directory, retrieve triples and
     * add to taxon concepts. Used by admin imageUpload.
     *
     * @param repoDir scan single directory
     * @throws Exception
     */
    public int singleImageUploadLoad(String repoDir) throws Exception {
        // reset counts
        totalFilesRead = 0;
        totalPropertiesSynced = 0;

        //start scan
        File dir = null;
        dir = new File(repoDir);
        logger.info("Processing directories..." + dir.getAbsolutePath());
        if (dir.isDirectory()) {
            File[] dirsToScan = { dir };
            scanDirectory(dirsToScan);
        }
        logger.info("Files read: " + totalFilesRead + ", files matched: " + totalPropertiesSynced);
        return totalFilesRead;
    }

    /**
     * Scan through the repository, retrieve triples and
     * add to taxon concepts.
     *
     * @param filePath Root directory of harvested repository
     * @param repoDirs Optional array of Infosource directories to scan passed as program arguments
     * @throws Exception
     */
    public int load(String filePath, String[] repoDirs, boolean allowStats) throws Exception {
        guidList = new ArrayList<String>();
        String lsidFileName = "/data/bie/repoLoader_guid_" + System.currentTimeMillis() + ".csv";
        FileOutputStream statsOut = null;
        logger.info("Scanning directory: " + filePath);

        //open the statistics file
        if (allowStats) {
            statsOut = FileUtils.openOutputStream(
                    new File("/data/bie/bie_name_matching_stats_" + System.currentTimeMillis() + ".csv"));
            statsOut.write(
                    "InfoSource ID, InfoSource Name, URL, ANBG matches, Other matches, Missing, Homonyms detected\n"
                            .getBytes());
        }
        if (gList || reindex) {
            guidOut = FileUtils.openOutputStream(new File(lsidFileName));
        }

        // reset counts
        totalFilesRead = 0;
        totalPropertiesSynced = 0;

        //start scan
        File file = new File(filePath);
        File[] dirs = null;

        // See if array of infosource directories passed as program arguments
        if (repoDirs.length > 0) {
            dirs = new File[repoDirs.length];
            for (int i = 0; i < repoDirs.length; i++) {
                dirs[i] = new File(file.getAbsolutePath() + File.separator + repoDirs[i]);
                logger.info("Processing directories..." + dirs[i].getAbsolutePath());
            }
        } else {
            //list immediate directories - this will give the infosource directories
            logger.info("Listing all directories...");
            dirs = file.listFiles();
        }

        //go through each infosource directory
        for (File childFile : dirs) {
            logger.info("Listing directories for infosource directory: " + childFile.getAbsolutePath());
            if (childFile.isDirectory()) {
                taxonConceptDao.resetStats();
                // takes us to /data/bie/<infosource-id>/<section-id>
                logger.info("Listing directories for the section: " + childFile.getAbsolutePath());
                File[] infosourceSection = childFile.listFiles();
                for (File sectionDirectory : infosourceSection) {
                    //this will list all the directories within the section
                    if (sectionDirectory.isDirectory()) {
                        File[] dirsToScan = sectionDirectory.listFiles((FileFilter) DirectoryFileFilter.DIRECTORY);
                        scanDirectory(dirsToScan);
                    }
                }
                if (allowStats) {
                    //report the stats
                    if (org.apache.commons.lang.StringUtils.isNumeric(childFile.getName())) {
                        InfoSource infoSource = infoSourceMap.get(new Integer(childFile.getName()));
                        taxonConceptDao.reportStats(statsOut, infoSource.getId() + "," + infoSource.getName() + ","
                                + infoSource.getWebsiteUrl());
                    }
                }
            }
        }
        logger.info("Files read: " + totalFilesRead + ", files matched: " + totalPropertiesSynced);

        if (allowStats) {
            statsOut.flush();
            statsOut.close();
        }
        if (gList) {
            guidOut.flush();
            guidOut.close();
        }
        if (reindex) {
            if (!gList) {
                //only want to include unique lsids
                Set<String> guids = new java.util.HashSet<String>(guidList);
                for (String guid : guids) {
                    guidOut.write((guid + "\n").getBytes());
                }
                guidOut.flush();
                guidOut.close();
            }
            //NC 2013-045-30: use the Partial Index to automatically reindex the values in the file.
            //This batches them into manageable chunks
            indexer.process(lsidFileName);

            // //This results in SOLR file locking problems.
            // //solrUtils.getSolrServer().commit();
            //
            // // need to call http://bie.ala.org.au/ws/admin/reindex with a JSON array of GUIDS to reindex
            // logger.debug("Calling bie service to reindex " + guidList.size());
            // HttpClient httpClient = new HttpClient();
            // PostMethod post = new PostMethod(reindexUrl);
            // ObjectMapper mapper = new ObjectMapper();
            //
            //// StringBuilder jsonBuilder = new StringBuilder();
            //// jsonBuilder.append("[");
            //// for (int i = 0; i < guidList.size(); i++) {
            ////     jsonBuilder.append("\"" + guidList.get(i) + "\"");
            ////
            ////     if (i < guidList.size() - 1) {
            ////         jsonBuilder.append(",");
            ////     }
            //// }
            //// jsonBuilder.append("]");
            //
            // post.setRequestHeader("Content-Type", "application/json");
            // post.setRequestBody(mapper.writeValueAsString(guidList));
            //
            // try {
            //     int returnCode = httpClient.executeMethod(post);
            //     if (returnCode != 200) {
            //         logger.error("Error submitting reindex request: " + post.getResponseBodyAsString());
            //     }
            // } catch (Exception ex) {
            //     logger.error("Error submitting reindex request", ex);
            //     logger.info(guidList);
            // }
        }
        return totalFilesRead;
    }

    /**
     * Retrieve the scientific name from the list of triples.
     *
     * @param triples
     * @return scientific name if found, null otherwise
     */
    private String getScientificName(List<Triple> triples) {
        for (Triple triple : triples) {
            if (triple.predicate.equalsIgnoreCase(Predicates.SCIENTIFIC_NAME.toString())) {
                return triple.object.toString();
            }
        }
        return null;
    }
    /**
     * Scan through the supplied directories.
     *
     * @param dirs
     */
    public void scanDirectory(File[] dirs) {

        int filesRead = 0;
        int propertiesSynced = 0;

        for (File currentDir : dirs) {
            logger.info("Reading directory: " + currentDir.getAbsolutePath());
            Iterator<File> fileIterator = FileUtils.iterateFiles(currentDir, null, true);
            while (fileIterator.hasNext()) {
                File currentFile = fileIterator.next();
                if (currentFile.getName().equals(FileType.RDF.toString())) {
                    filesRead++;
                    String infosourceId = currentFile.getParentFile().getParentFile().getParentFile().getName();
                    String infoSourceUid = infoSourceDAO.getUidByInfosourceId(String.valueOf(infosourceId));

                    //read the dublin core in the same directory - determine if its an image
                    try {
                        logger.info("Reading file: " + currentFile.getAbsolutePath());
                        FileReader reader = new FileReader(currentFile);
                        List<Triple> triples = TurtleUtils.readTurtle(reader);
                        //close the reader
                        reader.close();

                        String currentSubject = null;
                        List<Triple> splitBySubject = new ArrayList<Triple>();
                        String guid = null;

                        //iterate through triple, splitting the triples by subject
                        for (Triple triple : triples) {
                            if (currentSubject == null) {
                                currentSubject = triple.subject;
                            } else if (!currentSubject.equals(triple.subject)) {
                                //sync these triples
                                // /data/bie/1036/23/235332/rdf
                                guid = sync(currentFile, splitBySubject, infosourceId, infoSourceUid);
                                if (guid != null && guid.trim().length() > 0) {
                                    propertiesSynced++;
                                }
                                //clear list
                                splitBySubject.clear();
                                currentSubject = triple.subject;
                            }
                            splitBySubject.add(triple);
                        }

                        //sort out the buffer
                        if (!splitBySubject.isEmpty()) {
                            guid = sync(currentFile, splitBySubject, infosourceId, infoSourceUid);
                            if (guid != null && guid.trim().length() > 0) {
                                propertiesSynced++;
                            }
                        }
                        if (gList && guid != null) {
                            guidOut.write((guid + "\n").getBytes());
                        }
                        guidList.add(guid);
                    } catch (Exception e) {
                        logger.error("Error reading triples from file: '" + currentFile.getAbsolutePath() + "', "
                                + e.getMessage(), e);
                    }
                }
            }
            logger.info("InfosourceId: " + currentDir.getName() + " - Files read: " + filesRead
                    + ", files matched: " + propertiesSynced);
            totalFilesRead += filesRead;
            totalPropertiesSynced += propertiesSynced;
        }
    }
    /**
     * Synchronize triples to database.
     *
     * @param currentFile
     * @param triples
     * @throws Exception
     */
    private String sync(File currentFile, List<Triple> triples, String infosourceId, String infoSourceUid)
            throws Exception {

        String documentId = currentFile.getParentFile().getName();
        // Read dublin core
        // Added info source data to the Document via info source Map
        InfoSource infoSource = infoSourceMap.get(new Integer(infosourceId));
        Document document = readDcFile(currentFile);
        document.setId(Integer.parseInt(documentId));
        document.setInfoSourceId(infoSource.getId());
        document.setInfoSourceName(infoSource.getName());
        document.setInfoSourceUri(infoSource.getWebsiteUrl());
        document.setFilePath(currentFile.getParentFile().getAbsolutePath());
        if (infoSourceUid != null && !"".equals(infoSourceUid)) {
            document.setInfoSourceUid(infoSourceUid);
        }
        Map<String, String> dc = readDcFileAsMap(currentFile);

        // Sync the triples and associated DC data
        logger.info("Attempting to sync triple where Scientific Name = " + getScientificName(triples));
        String guid = taxonConceptDao.syncTriples(document, triples, dc, statsOnly);
        // boolean success = taxonConceptDao.syncTriplesReturnSuccess(document, triples, dc, statsOnly);
        logger.info("Processed file: " + currentFile.getAbsolutePath() + ", Scientific Name = "
                + getScientificName(triples) + ", guid: " + guid);
        return guid;
    }

    /**
     * Initialise the info source map
     *
     * @return infoSourceMap
     */
    public void loadInfoSources() {
        this.infoSourceMap = new HashMap<Integer, InfoSource>();
        this.uidInfoSourceMap = new HashMap<String, Integer>();
        if (infoSourceDAO != null) {
            List<Integer> allIds = infoSourceDAO.getIdsforAll();
            Map<String, String> allUids = infoSourceDAO.getInfosourceIdUidMap();
            for (Integer id : allIds) {
                infoSourceMap.put(id, infoSourceDAO.getById(id));
                if (allUids.get(id.toString()) != null && !"".equals(allUids.get(id.toString()))) {
                    uidInfoSourceMap.put(allUids.get(id.toString()), id);
                }
            }
        }
        logger.info("loaded infoSource map: " + infoSourceMap.size());
    }

    /**
     * Read dc file and populate a Document with values from file
     *
     * @param currentFile
     * @return doc the Document to return
     */
    private Document readDcFile(File currentFile) {
        Document doc = new Document();
        String rdfFileName = currentFile.getAbsolutePath();
        String dcFileName = rdfFileName.replaceFirst("rdf", "dc");
        File dcfile = new File(dcFileName);
        List<String[]> dcContents = new ArrayList<String[]>();

        try {
            dcContents = repoFileUtils.readRepositoryFile(dcfile);
            for (String[] line : dcContents) {
                // expect 2 element String array (key, value)
                if (line[0].equalsIgnoreCase(Predicates.DC_IDENTIFIER.toString())) {
                    doc.setIdentifier(line[1]);
                } else if (line[0].equalsIgnoreCase(Predicates.DC_TITLE.toString())) {
                    doc.setTitle(line[1]);
                } else if (line[0].equalsIgnoreCase(Predicates.DC_FORMAT.toString())) {
                    doc.setMimeType(line[1]);
                }
            }
        } catch (Exception ex) {
            logger.error("Cannot open dc file: " + dcFileName + " - " + ex.getMessage());
        }
        return doc;
    }

    /**
     * Read dc file and populate a map with values from file
     *
     * @param currentFile
     * @return dc the Map to return
     */
    private Map<String, String> readDcFileAsMap(File currentFile) {
        String rdfFileName = currentFile.getAbsolutePath();
        String dcFileName = rdfFileName.replaceFirst("rdf", "dc");
        File dcfile = new File(dcFileName);
        Map<String, String> dc = null;
        try {
            dc = repoFileUtils.readDcFileAsMap(dcfile);
        } catch (Exception ex) {
            logger.error("Cannot open dc file: " + dcFileName + " - " + ex.getMessage());
        }
        return dc;
    }

    /**
     * @param taxonConceptDao the taxonConceptDao to set
     */
    public void setTaxonConceptDao(TaxonConceptDao taxonConceptDao) {
        this.taxonConceptDao = taxonConceptDao;
    }

    public static String getRepositoryDir() {
        return repositoryDir;
    }

    public HashMap<String, Integer> getUidInfoSourceMap() {
        return uidInfoSourceMap;
    }

    public void setReindexUrl(String reindexUrl) {
        this.reindexUrl = reindexUrl;
    }
}
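
For reference, here is a minimal sketch of driving the loader programmatically rather than through its own main(). It assumes the same Spring wiring shown above (SpringUtils.getContext() and the RepoDataLoader bean); the driver class RepoDataLoaderExample is not part of the original source, and the infosource id "1036" is only an illustrative value borrowed from the path comment /data/bie/1036/23/235332/rdf in scanDirectory().

package org.ala.hbase;

import org.ala.util.SpringUtils;
import org.springframework.context.ApplicationContext;

/**
 * Hypothetical driver class showing one way the loader above could be invoked
 * programmatically. Not part of the original RepoDataLoader source.
 */
public class RepoDataLoaderExample {

    public static void main(String[] args) throws Exception {
        // Obtain the Spring-managed loader, exactly as RepoDataLoader.main() does.
        ApplicationContext context = SpringUtils.getContext();
        RepoDataLoader loader = (RepoDataLoader) context.getBean(RepoDataLoader.class);

        // main() populates the infosource map before loading; do the same here.
        loader.loadInfoSources();

        // Scan a single infosource directory ("1036" is a hypothetical example id)
        // under the repository root, without writing the name-matching stats file.
        int filesRead = loader.load(RepoDataLoader.getRepositoryDir(), new String[]{"1036"}, false);
        System.out.println("Files read: " + filesRead);
    }
}

From the command line, the javadoc on main() indicates the equivalent invocation takes an optional -stats, -reindex, or -gList flag followed by a list of infosource ids, or -biocache (optionally with -lastWeek, -lastMonth, or -lastYear) to derive the infosource list from the biocache facet query.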