Java tutorial
/** * Symplectic to Vivo Connector * Copyright (c) 2011 Ian Boston * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package uk.co.tfd.symplectic.harvester; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.security.NoSuchAlgorithmException; import java.sql.SQLException; import java.util.Arrays; import java.util.HashSet; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.FutureTask; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactoryConfigurationError; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.vivoweb.harvester.util.InitLog; import org.vivoweb.harvester.util.args.ArgDef; import org.vivoweb.harvester.util.args.ArgList; import org.vivoweb.harvester.util.args.ArgParser; import org.vivoweb.harvester.util.repo.RecordHandler; import org.w3c.dom.DOMException; import org.w3c.dom.Document; import org.xml.sax.SAXException; public class SymplecticFetch { /** * boolean arg, if true, lists will be re-fetched and anything that is new * will get fetched, default is false */ private static final String UPDATE_LIST_ARG = "updateList"; /** * Integer Argument name: The max number of pages that should be loaded from * a list, default is 20 */ private static final String LIMIT_LIST_PAGES_ARG = "limitListPages"; /** * Integer Argument name: The maximum number of URLs to get in a single run, * default is 10000 */ private static final String MAX_URL_GET_ARG = "maxUrlGet"; /** * String Argument name: The URL of the categories in the Elements server, * no default, required */ private static final String URL_ARG = "url"; /** * String Argument name: a , separated list of object types that will be encountered including * those in the initial seed of the graph. */ private static final String OBJECT_TYPES_ARG = "categories"; /** * String Argument name: A , separated list of categories that should not be loaded from * relationships, no default required. */ private static final String EXCLUDED_RELATIONSHIP_OBJECT_TYPES_ARG = "excludeRelationshipObjectCategories"; /** * String Argument name: THe query URL used to seed the fetch operation. * defaults to baseUrl + "/objects?categories=users" */ private static final String SEED_QUERY_URL_ARG = "seedQuery"; private static final Logger LOGGER = LoggerFactory.getLogger(SymplecticFetch.class); private static String database = "symplectic"; private RecordHandler rh; private String baseUrl; private int maxUrlFetch; private int limitListPages; private boolean updateLists; private String[] objectTypes; private long lastLog = System.currentTimeMillis(); private Set<String> excludedRelationshipObjectTypes; private String seedQueryUrl; protected SymplecticFetch(RecordHandler rh, String database) { if (rh == null) { throw new RuntimeException("Record Handler cant be null"); } System.err.println("Using record handler " + rh); this.rh = rh; } /** * Constructor * * @param argList * parsed argument list * @param database * database name * @throws IOException * error creating task */ protected SymplecticFetch(ArgList argList) throws IOException { this(RecordHandler.parseConfig(argList.get("o"), argList.getValueMap("O")), database); baseUrl = argList.get(URL_ARG); maxUrlFetch = Integer.parseInt(argList.get(MAX_URL_GET_ARG)); limitListPages = Integer.parseInt(argList.get(LIMIT_LIST_PAGES_ARG)); updateLists = Boolean.parseBoolean(argList.get(UPDATE_LIST_ARG)); objectTypes = StringUtils.split(argList.get(OBJECT_TYPES_ARG), ","); excludedRelationshipObjectTypes = toSet( StringUtils.split(argList.get(EXCLUDED_RELATIONSHIP_OBJECT_TYPES_ARG), ",")); seedQueryUrl = argList.get(SEED_QUERY_URL_ARG); if (seedQueryUrl == null || seedQueryUrl.trim().length() == 0) { seedQueryUrl = baseUrl + "/objects?categories=users"; } LOGGER.info("Config: Elements API at {} ", baseUrl); LOGGER.info("Config: Seed Query {} ", seedQueryUrl); LOGGER.info("Config: Max Number of URLs to fetch {} ", maxUrlFetch); LOGGER.info("Config: Max Number of Pages to list {} ", limitListPages); LOGGER.info("Config: Refetch lists {} ", updateLists); LOGGER.info("Config: Object Types {} ", Arrays.toString(objectTypes)); LOGGER.info("Config: Excluded Types in relationships {} ", Arrays.toString(excludedRelationshipObjectTypes.toArray())); LOGGER.info("To change any of these edit {} ", argList.get("X")); } private Set<String> toSet(String[] values) { Set<String> s = new HashSet<String>(); for (String v : values) { s.add(v); } return s; } public static void main(String[] args) { try { InitLog.initLogger(args, getParser("SymplecticFetch", database)); LOGGER.info("SymplecticFetch: Start"); SymplecticFetch sf = new SymplecticFetch(getParser("SymplecticFetch", database).parse(args)); sf.execute(); } catch (Exception e) { LOGGER.error(e.getMessage(), e); LOGGER.debug("Stacktrace:", e); System.out.println(getParser("SymplecticFetch", database).getUsage()); System.exit(1); } LOGGER.info("SymplecticFetch: End"); } /** * Executes the task * * @param baseUrl * * @throws UnsupportedEncodingException * * @throws IOException * error processing search * @throws TransformerException * @throws TransformerFactoryConfigurationError * @throws ParserConfigurationException * @throws SAXException * @throws DOMException * @throws NoSuchAlgorithmException * @throws AtomEntryLoadException */ private void execute() throws DOMException, NoSuchAlgorithmException, UnsupportedEncodingException, IOException, SAXException, ParserConfigurationException, TransformerFactoryConfigurationError, TransformerException { ProgressTracker progress = null; try { progress = new JDBCProgressTrackerImpl(rh, limitListPages, updateLists, objectTypes, excludedRelationshipObjectTypes); } catch (SQLException e) { LOGGER.info(e.getMessage(), e); progress = new FileProgressTrackerImpl("loadstate", rh, limitListPages, updateLists, objectTypes, excludedRelationshipObjectTypes); } catch (IOException e) { LOGGER.info(e.getMessage(), e); progress = new FileProgressTrackerImpl("loadstate", rh, limitListPages, updateLists, objectTypes, excludedRelationshipObjectTypes); } // re-scan relationships to extract API objects // reScanRelationships(progress); // baseUrl + "/objects?categories=users&groups=3" progress.toload(seedQueryUrl, new APIObjects(rh, "users", progress, limitListPages, objectTypes)); // progress.toload(baseUrl+"publication", new APIObjects(rh, // "publications", progress)); int i = 0; int threadPoolSize = 20; ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize); final ConcurrentHashMap<String, FutureTask<String>> worklist = new ConcurrentHashMap<String, FutureTask<String>>(); while (i < maxUrlFetch) { Entry<String, AtomEntryLoader> next = progress.next(); if (next == null) { int startingWorklistSize = worklist.size(); while (worklist.size() > 0 && worklist.size() >= startingWorklistSize) { consumeTasks(worklist, progress); if (worklist.size() >= startingWorklistSize) { try { Thread.sleep(500); } catch (InterruptedException e) { } } } if (!progress.hasPending() && worklist.size() == 0) { break; // there are none left to come, the workers are empty, and so is pending } } else { final AtomEntryLoader loader = next.getValue(); final String key = next.getKey(); FutureTask<String> task = new FutureTask<String>(new Callable<String>() { @Override public String call() throws Exception { try { loader.loadEntry(key); } catch (Exception e) { LOGGER.error(e.getMessage(), e); } return "Done Loading " + key; } }); worklist.put(key, task); executorService.execute(task); i++; // dont overfill the queue while (worklist.size() > threadPoolSize * 2) { consumeTasks(worklist, progress); if (worklist.size() > threadPoolSize) { try { Thread.sleep(500); } catch (InterruptedException e) { } } } } } while (worklist.size() > 0) { consumeTasks(worklist, progress); Thread.yield(); } executorService.shutdown(); LOGGER.info("End ToDo list contains {} urls ", progress.pending()); progress.dumpLoaded(); progress.checkpoint(); } private void consumeTasks(ConcurrentHashMap<String, FutureTask<String>> worklist, ProgressTracker tracker) { for (Entry<String, FutureTask<String>> e : worklist.entrySet()) { if (e.getValue().isDone()) { try { LOGGER.info("Recieved " + e.getValue().get()); } catch (Exception e1) { LOGGER.info("Failed {} ", e.getKey(), e1); } worklist.remove(e.getKey()); } } if (System.currentTimeMillis() > lastLog + 5000) { LOGGER.info("Current Worklist Backlog {} In Pending or Loading state {} ", worklist.size(), tracker.pending()); lastLog = System.currentTimeMillis(); } } @SuppressWarnings("unused") private void reScanRelationships(ProgressTracker tracker) { File publicationsXml = new File("data/raw-records"); APIObject userObject = new APIObject(rh, "user", tracker, limitListPages, objectTypes); APIObject publicationObject = new APIObject(rh, "publication", tracker, limitListPages, objectTypes); for (File f : publicationsXml.listFiles()) { if (f.getName().startsWith("relationship")) { try { Document doc = XmlAide.loadXmlDocument(f.toURI().toURL().toString()); userObject.loadEntrys(doc); publicationObject.loadEntrys(doc); } catch (Exception e) { LOGGER.error(e.getMessage(), e); } } } } /** * Get the ArgParser for this task * * @param appName * the application name * @param database * the database name * @return the ArgParser */ protected static ArgParser getParser(String appName, String database) { ArgParser parser = new ArgParser(appName); parser.addArgument(new ArgDef().setShortOption('o').setLongOpt("output") .setDescription("RecordHandler config file path").withParameter(true, "CONFIG_FILE") .setRequired(false)); parser.addArgument(new ArgDef().setShortOption('O').setLongOpt("outputOverride") .withParameterValueMap("RH_PARAM", "VALUE") .setDescription("override the RH_PARAM of output recordhandler using VALUE").setRequired(false)); parser.addArgument(new ArgDef().setLongOpt(URL_ARG).setRequired(true).withParameter(true, "url") .setDescription("URL of the Symplectic Elements API")); parser.addArgument(new ArgDef().setLongOpt(OBJECT_TYPES_ARG).setRequired(false) .withParameter(true, "category").setDefaultValue("user,publication,grant,activity") .setDescription("Categories to extract")); parser.addArgument(new ArgDef().setLongOpt(EXCLUDED_RELATIONSHIP_OBJECT_TYPES_ARG).setRequired(false) .withParameter(true, "exclude").setDefaultValue("user") .setDescription("Categories to exclude from crawl through relationships")); parser.addArgument( new ArgDef().setLongOpt(SEED_QUERY_URL_ARG).setRequired(false).withParameter(true, "seed") .setDefaultValue("").setDescription("The seed url used to seed the graph of objects")); parser.addArgument(new ArgDef().setLongOpt(UPDATE_LIST_ARG).setRequired(false).setDefaultValue("false") .withParameter(true, "num") .setDescription("If true, Atom Feeds that return lists will be rescanned for changes")); parser.addArgument(new ArgDef().setLongOpt(LIMIT_LIST_PAGES_ARG).setRequired(false).setDefaultValue("20") .withParameter(true, "num") .setDescription("The maximum number of pages in a list that will be retrieved")); parser.addArgument(new ArgDef().setLongOpt(MAX_URL_GET_ARG).setRequired(false).setDefaultValue("1000") .withParameter(true, "num") .setDescription("The maximum number of urls that will be retrieved in a run")); return parser; } }