// Java tutorial
/* * #%L * Netarchivesuite - harvester * %% * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, * the National Library of France and the Austrian National Library. * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 2.1 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Lesser Public License for more details. * * You should have received a copy of the GNU General Lesser Public * License along with this program. If not, see * <http://www.gnu.org/licenses/lgpl-2.1.html>. * #L% */ package dk.netarkivet.harvester.datamodel; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.sql.SQLException; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import javax.servlet.jsp.PageContext; import org.apache.commons.io.LineIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.antiaction.raptor.dao.AttributeBase; import com.antiaction.raptor.dao.AttributeTypeBase; import dk.netarkivet.common.exceptions.ArgumentNotValid; import dk.netarkivet.common.exceptions.IOFailure; import dk.netarkivet.common.exceptions.UnknownID; import dk.netarkivet.common.utils.DomainUtils; import dk.netarkivet.common.utils.I18n; import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory; import dk.netarkivet.harvester.datamodel.eav.EAV; import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; import 
dk.netarkivet.harvester.webinterface.EventHarvestUtil; /** * This class contains the specific properties and operations of harvest definitions which are not snapshot harvest * definitions. I.e. this class models definitions of event and selective harvests. */ public class PartialHarvest extends HarvestDefinition { private static final Logger log = LoggerFactory.getLogger(PartialHarvest.class); /** * Set of domain configurations being harvested by this harvest. Entries in this set are unique on configuration * name + domain name. */ private Map<SparseDomainConfiguration, DomainConfiguration> domainConfigurations = new HashMap<SparseDomainConfiguration, DomainConfiguration>(); /** The schedule used by this PartialHarvest. */ private Schedule schedule; /** * The next date this harvest definition should run, null if never again. */ private Date nextDate; /** * Create new instance of a PartialHavest configured according to the properties of the supplied * DomainConfiguration. * * @param domainConfigurations a list of domain configurations * @param schedule the harvest definition schedule * @param harvestDefName the name of the harvest definition * @param comments comments * @param audience The intended audience for this harvest (could be null) */ public PartialHarvest(List<DomainConfiguration> domainConfigurations, Schedule schedule, String harvestDefName, String comments, String audience) { super(DAOProviderFactory.getExtendedFieldDAOProvider()); ArgumentNotValid.checkNotNull(schedule, "schedule"); ScheduleDAO.getInstance().read(schedule.getName()); ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName"); ArgumentNotValid.checkNotNull(comments, "comments"); ArgumentNotValid.checkNotNull(domainConfigurations, "domainConfigurations"); this.numEvents = 0; addConfigurations(domainConfigurations); this.schedule = schedule; this.harvestDefName = harvestDefName; this.comments = comments; this.nextDate = schedule.getFirstEvent(new Date()); this.audience = 
audience; } /** * Returns the schedule defined for this harvest definition. * * @return schedule */ public Schedule getSchedule() { return schedule; } /** * Set the schedule to be used for this harvestdefinition. * * @param schedule A schedule for when to try harvesting. */ public void setSchedule(Schedule schedule) { ArgumentNotValid.checkNotNull(schedule, "schedule"); this.schedule = schedule; if (nextDate != null) { setNextDate(schedule.getFirstEvent(nextDate)); } } /** * Get the next date this harvest definition should be run. * * @return The next date the harvest definition should be run or null, if the harvest definition should never run * again. */ public Date getNextDate() { return nextDate; } /** * Set the next date this harvest definition should be run. * * @param nextDate The next date the harvest definition should be run. May be null, meaning never again. */ public void setNextDate(Date nextDate) { this.nextDate = nextDate; } /** * Remove domainconfiguration from this partialHarvest. * * @param dcKey domainConfiguration key */ public void removeDomainConfiguration(SparseDomainConfiguration dcKey) { ArgumentNotValid.checkNotNull(dcKey, "DomainConfigurationKey dcKey"); if (domainConfigurations.remove(dcKey) == null) { log.warn("Unable to delete domainConfiguration '{}' from {}. Reason: didn't exist.", dcKey, this); } } /** * Add a new domainconfiguration to this PartialHarvest. * * @param newConfiguration A new DomainConfiguration */ public void addDomainConfiguration(DomainConfiguration newConfiguration) { ArgumentNotValid.checkNotNull(newConfiguration, "DomainConfiguration newConfiguration"); SparseDomainConfiguration key = new SparseDomainConfiguration(newConfiguration); if (domainConfigurations.containsKey(key)) { log.warn("Unable to add domainConfiguration '{}' from {}. 
Reason: does already exist.", newConfiguration, this); } else { domainConfigurations.put(key, newConfiguration); } } /** * Returns a List of domain configurations for this harvest definition. * * @return List containing information about the domain configurations */ public Iterator<DomainConfiguration> getDomainConfigurations() { return domainConfigurations.values().iterator(); } /** * @return the domainconfigurations as a list */ public Collection<DomainConfiguration> getDomainConfigurationsAsList() { return domainConfigurations.values(); } /** * Set the list of configurations that this PartialHarvest uses. * * @param configs List<DomainConfiguration> the configurations that this harvestdefinition will use. */ public void setDomainConfigurations(List<DomainConfiguration> configs) { ArgumentNotValid.checkNotNull(configs, "configs"); domainConfigurations.clear(); addConfigurations(configs); } /** * Add the list of configurations to the configuration associated with this PartialHarvest. * * @param configs a List of configurations */ private void addConfigurations(List<DomainConfiguration> configs) { for (DomainConfiguration dc : configs) { addConfiguration(dc); } } /** * Add a configuration to this PartialHarvest. * * @param dc the given configuration */ private void addConfiguration(DomainConfiguration dc) { domainConfigurations.put(new SparseDomainConfiguration(dc), dc); } /** * Reset the harvest definition to no harvests and next date being the first possible for the schedule. */ public void reset() { numEvents = 0; nextDate = schedule.getFirstEvent(new Date()); } /** * Check if this harvest definition should be run, given the time now. 
* * @param now The current time * @return true if harvest definition should be run */ public boolean runNow(Date now) { ArgumentNotValid.checkNotNull(now, "now"); if (!getActive()) { return false; // inactive definitions are never run } return nextDate != null && now.compareTo(nextDate) >= 0; } /** * Returns whether this HarvestDefinition represents a snapshot harvest. * * @return false (always) */ public boolean isSnapShot() { return false; } /** * Always returns no limit. * * @return 0, meaning no limit. */ public long getMaxCountObjects() { return Constants.HERITRIX_MAXOBJECTS_INFINITY; } /** * Always returns no limit. * * @return -1, meaning no limit. */ public long getMaxBytes() { return Constants.HERITRIX_MAXBYTES_INFINITY; } /** * Takes a seed list and creates any necessary domains, configurations, and seedlists to enable them to be harvested * with the given template and other parameters. <A href="https://sbforge.org/jira/browse/NAS-1317">JIRA issue * NAS-1317</A> addresses this issue. Current naming of the seedlists and domainconfigurations are: one of <br> * harvestdefinitionname + "_" + templateName + "_" + "UnlimitedBytes" (if maxbytes is negative)<br> * harvestdefinitionname + "_" + templateName + "_" + maxBytes + "Bytes" (if maxbytes is zero or postive). 
* * @param seeds a list of the seeds to be added * @param templateName the name of the template to be used * @param maxBytes Maximum number of bytes to harvest per domain * @param maxObjects Maximum number of objects to harvest per domain * @param attributeValues Attributes read from webpage * @see EventHarvestUtil#addConfigurations(PageContext, I18n, String) for details */ public void addSeeds(Set<String> seeds, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) { ArgumentNotValid.checkNotNull(seeds, "seeds"); ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); if (!TemplateDAO.getInstance().exists(templateName)) { throw new UnknownID("No such template: " + templateName); } Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); StringBuilder invalidMessage = new StringBuilder( "Unable to create an event harvest.\n" + "The following seeds are invalid:\n"); boolean valid = true; // validate: for (String seed : seeds) { boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); if (!seedValid) { valid = false; } } if (!valid) { throw new ArgumentNotValid(invalidMessage.toString()); } addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); } /** * This method is a duplicate of the addSeeds method but for seedsFile parameter * * @param seedsFile a newline-separated File containing the seeds to be added * @param templateName the name of the template to be used * @param maxBytes Maximum number of bytes to harvest per domain * @param maxObjects Maximum number of objects to harvest per domain */ public void addSeedsFromFile(File seedsFile, String templateName, long maxBytes, int maxObjects, Map<String, String> attributeValues) { ArgumentNotValid.checkNotNull(seedsFile, "seeds"); ArgumentNotValid.checkTrue(seedsFile.isFile(), "seedsFile does not exist"); ArgumentNotValid.checkNotNullOrEmpty(templateName, "templateName"); if 
(!TemplateDAO.getInstance().exists(templateName)) { throw new UnknownID("No such template: " + templateName); } Map<String, Set<String>> acceptedSeeds = new HashMap<String, Set<String>>(); StringBuilder invalidMessage = new StringBuilder( "Unable to create an event harvest.\n" + "The following seeds are invalid:\n"); boolean valid = true; // validate all the seeds in the file // those accepted are entered into the acceptedSeeds datastructure // Iterate through the contents of the file LineIterator seedIterator = null; try { seedIterator = new LineIterator(new FileReader(seedsFile)); while (seedIterator.hasNext()) { String seed = seedIterator.next(); boolean seedValid = processSeed(seed, invalidMessage, acceptedSeeds); if (!seedValid) { valid = false; } } } catch (IOException e) { throw new IOFailure("Unable to process seedsfile ", e); } finally { LineIterator.closeQuietly(seedIterator); } if (!valid) { throw new ArgumentNotValid(invalidMessage.toString()); } addSeedsToDomain(templateName, maxBytes, maxObjects, acceptedSeeds, attributeValues); } /** * Process each seed. * * @param seed The given seed. * @param invalidMessage The message builder where the invalid seeds are added. * @param acceptedSeeds The set of accepted seeds * @return true, if the processed seed is valid or empty. 
*/ private boolean processSeed(String seed, StringBuilder invalidMessage, Map<String, Set<String>> acceptedSeeds) { seed = seed.trim(); if (seed.length() != 0 && !seed.startsWith("#") && !seed.startsWith("//")) { // ignore empty lines and comments if (!(seed.toLowerCase().startsWith("http://") || seed.toLowerCase().startsWith("https://"))) { seed = "http://" + seed; } URL url = null; try { url = new URL(seed); } catch (MalformedURLException e) { invalidMessage.append(seed); invalidMessage.append('\n'); return false; } String host = url.getHost(); String domainName = DomainUtils.domainNameFromHostname(host); if (domainName == null) { invalidMessage.append(seed); invalidMessage.append('\n'); return false; } Set<String> seedsForDomain = acceptedSeeds.get(domainName); if (seedsForDomain == null) { seedsForDomain = new HashSet<String>(); acceptedSeeds.put(domainName, seedsForDomain); } seedsForDomain.add(seed); } return true; } /** * Generate domain configurations for the accepted seeds. * * @param templateName The Heritrix template to be used. 
* @param maxBytes The number of max bytes allowed * @param maxObjects The number of max objected allowed * @param acceptedSeeds The set of accepted seeds */ private void addSeedsToDomain(String templateName, long maxBytes, int maxObjects, Map<String, Set<String>> acceptedSeeds, Map<String, String> attributeValues) { // Generate components for the name for the configuration and seedlist final String maxbytesSuffix = "Bytes"; String maxBytesS = "Unlimited" + maxbytesSuffix; if (maxBytes >= 0) { maxBytesS = Long.toString(maxBytes); maxBytesS = maxBytesS + maxbytesSuffix; } final String maxobjectsSuffix = "Objects"; String maxObjectsS = "Unlimited" + maxobjectsSuffix; if (maxObjects >= 0) { maxObjectsS = Long.toString(maxObjects); maxObjectsS = maxObjectsS + maxobjectsSuffix; } String name = harvestDefName + "_" + templateName + "_" + maxBytesS + "_" + maxObjectsS; Set<DomainConfiguration> newDcs = new HashSet<DomainConfiguration>(); for (Map.Entry<String, Set<String>> entry : acceptedSeeds.entrySet()) { String domainName = entry.getKey(); Domain domain; List<SeedList> seedListList = new ArrayList<SeedList>(); SeedList seedlist; // Find or create the domain if (DomainDAO.getInstance().exists(domainName)) { domain = DomainDAO.getInstance().read(domainName); // If a config with this name exists already for the dommain, add a "_" + timestamp to the end of the name to be make it unique. // This will probably happen rarely. // This name is used for both the configuration and corresponding seed if (domain.hasConfiguration(name)) { String oldName = name; name = name + "_" + System.currentTimeMillis(); log.info( "configuration '{}' for domain '{}' already exists. Change name for config and corresponding seed to ", oldName, name, domain.getName()); } seedlist = new SeedList(name, ""); // Assure that the seedname is the same as the configname. 
seedListList.add(seedlist); domain.addSeedList(seedlist); } else { seedlist = new SeedList(name, ""); // Assure that the seedname is the same as the configname. seedListList.add(seedlist); log.info("Creating domain {} in DomainDAO", domainName); domain = Domain.getDefaultDomain(domainName); domain.addSeedList(seedlist); DomainDAO.getInstance().create(domain); } DomainConfiguration dc = new DomainConfiguration(name, domain, seedListList, new ArrayList<Password>()); dc.setOrderXmlName(templateName); dc.setMaxBytes(maxBytes); dc.setMaxObjects(maxObjects); domain.addConfiguration(dc); log.info("Adding seeds til new configuration '{}' (id={}) for domain '{}' ", name, dc.getID(), domain.getName()); // Find the SeedList and add this seed to it seedlist = domain.getSeedList(name); List<String> currentSeeds = seedlist.getSeeds(); entry.getValue().addAll(currentSeeds); List<String> allSeeds = new ArrayList<String>(); allSeeds.addAll(entry.getValue()); domain.updateSeedList(new SeedList(name, allSeeds)); // Add the configuration to the list of new configs for // this harvest. newDcs.add(dc); DomainDAO.getInstance().update(domain); log.info("Created configuration '{}' for domain {} with ID {}", dc.getName(), dc.getDomainName(), dc.getID()); saveAttributes(dc, attributeValues); } boolean thisInDAO = HarvestDefinitionDAO.getInstance().exists(this.harvestDefName); if (thisInDAO) { // We have previously created this harvestdefinition in the HarvestDefinitionDAO. 
HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance(); for (DomainConfiguration dc : newDcs) { addConfiguration(dc); hddao.addDomainConfiguration(this, new SparseDomainConfiguration(dc)); } hddao.update(this); } else { // not yet created in the HarvestDefinitionDAO for (DomainConfiguration dc : newDcs) { addConfiguration(dc); } HarvestDefinitionDAO.getInstance().create(this); } } private void saveAttributes(DomainConfiguration dc, Map<String, String> attributeValues) { if (dc.getID() == null) { log.warn("Attributes not saved to database. Id of domainConfiguration not yet available"); return; } // EAV try { long entity_id = dc.getID(); log.info("Saving attributes for domain config id {} and name {} and domain {}", entity_id, dc.getName(), dc.getDomainName()); EAV eav = EAV.getInstance(); List<AttributeAndType> attributeTypes = eav.getAttributesAndTypes(EAV.DOMAIN_TREE_ID, (int) entity_id); log.debug("3 attributes available for entity {}", entity_id); AttributeAndType attributeAndType; AttributeTypeBase attributeType; AttributeBase attribute; for (int i = 0; i < attributeTypes.size(); ++i) { attributeAndType = attributeTypes.get(i); attributeType = attributeAndType.attributeType; log.debug("Examining attribute {}", attributeType.name); attribute = attributeAndType.attribute; if (attribute == null) { attribute = attributeType.instanceOf(); attribute.entity_id = (int) entity_id; } switch (attributeType.viewtype) { case 1: String paramValue = attributeValues.get(attributeType.name); int intValue; if (paramValue != null) { intValue = Integer.decode(paramValue); } else { intValue = attributeType.def_int; } log.info("Setting attribute {} to value {}", attributeType.name, intValue); attribute.setInteger(intValue); break; case 5: case 6: paramValue = attributeValues.get(attributeType.name); int intVal = 0; if (paramValue != null && !"0".equals(paramValue)) { intVal = 1; } log.debug("Set intVal = 1 for attribute {} when receiving paramValue={}", attributeType.name, 
paramValue); attribute.setInteger(intVal); break; } eav.saveAttribute(attribute); } } catch (SQLException e) { throw new RuntimeException("Unable to store EAV data!", e); } } }