Java tutorial
/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.ltbot.postprocessor;

import static org.archive.modules.fetcher.FetchStatusCodes.S_OUT_OF_SCOPE;

import java.util.Arrays;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.SchedulingConstants;
import org.archive.modules.extractor.Hop;
import org.json.JSONObject;

/**
 * Processor that derives a scheduling directive and a precedence cost for a
 * candidate URI from a numeric "decisive value" found in the extra info of the
 * URI it was discovered via (see {@link CrawlURI#getFullVia()}).
 *
 * <p>The value is read from the extra-info field named by
 * {@link #getExtraInfoValueFieldName()} (by default
 * {@code SharedConstants.EXTRA_INFO_PERPLEXITY}). Lower values map to higher
 * priorities; values that exceed every configured boundary, or that are
 * negative or NaN, cause the URI to be marked {@code S_OUT_OF_SCOPE}.
 *
 * <p>All counter updates in {@link #innerProcess(CrawlURI)} happen under a
 * private lock, and {@link #report()} reads the counters under the same lock.
 *
 * @author Steffen Remus
 **/
public class DecesiveValuePrioritizer extends Processor {

    private final static Logger LOG = Logger.getLogger(DecesiveValuePrioritizer.class.getName());

    /** Precedence cost used for rejected URIs and for values at or above the configured maximum. */
    private static final int WORST_COST = 255;

    /** Maximum number of nested causes that are logged for a single failure. */
    private static final int MAX_LOGGED_CAUSES = 9;

    /** Number of URIs that were marked out of scope; guarded by {@link #_lck}. */
    private long _count_reject;

    /** Number of URIs per assigned scheduling directive (index = directive); guarded by {@link #_lck}. */
    private final long[] _assignment_counts;

    private final Object _lck = new Object();

    /**
     * Creates a prioritizer with the default field name, the default
     * boundaries {@code 5e2,5e3,Infinity} and a maximum value of 15.
     */
    public DecesiveValuePrioritizer() {
        setExtraInfoValueFieldName(SharedConstants.EXTRA_INFO_PERPLEXITY);
        // one boundary for each priority (HIGH, MEDIUM, NORMAL); HIGHEST is reserved for prerequisites
        setAssignmentBoundaries("5e2,5e3,Infinity");
        setMaxvalue(15);
        _assignment_counts = new long[4]; // one slot for each priority
        Arrays.fill(_assignment_counts, 0L);
        _count_reject = 0L;
    }

    /**
     * @return the name of the extra-info field of the via URI that holds the decisive value
     */
    public String getExtraInfoValueFieldName() {
        return (String) getKeyedProperties().get("ExtraInfoFieldName");
    }

    /**
     * @param extraInfoFieldName name of the extra-info field of the via URI that holds the decisive value
     */
    public void setExtraInfoValueFieldName(String extraInfoFieldName) {
        getKeyedProperties().put("ExtraInfoFieldName", extraInfoFieldName);
    }

    /**
     * @return the boundaries exactly as they were passed to {@link #setAssignmentBoundaries(String)}
     */
    public String getAssignmentBoundaries() {
        return (String) getKeyedProperties().get("assignmentBoundaries");
    }

    /**
     * Sets the upper (inclusive) boundaries of the decisive value for the
     * priorities HIGH, MEDIUM and NORMAL, in that order.
     *
     * <p>The parsed values are stored shifted by one position so that they can
     * be indexed directly with the {@link SchedulingConstants} priority
     * constants; index 0 is an unused dummy entry. Fewer than three values are
     * not rejected here but will make later lookups fail with an
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param assignmentBoundaries comma separated list of doubles, e.g. {@code "5e2,5e3,Infinity"}
     * @throws NumberFormatException if an entry cannot be parsed as a double
     */
    public void setAssignmentBoundaries(String assignmentBoundaries) {
        String[] parts = assignmentBoundaries.split(",");
        double[] bounds = new double[parts.length + 1]; // first value is a dummy value, never used
        for (int i = 0; i < parts.length; i++) {
            bounds[i + 1] = Double.parseDouble(parts[i]);
        }
        getKeyedProperties().put("assignmentBoundaries", assignmentBoundaries);
        getKeyedProperties().put("assignmentBoundaries_doubles", bounds);
    }

    /**
     * @return the value that is mapped to the upper end of the precedence cost range
     */
    public double getMaxvalue() {
        return (Double) getKeyedProperties().get("maxvalue");
    }

    /**
     * @param value the value that is mapped to the upper end of the precedence cost range
     */
    public void setMaxvalue(double value) {
        getKeyedProperties().put("maxvalue", value);
    }

    /** @return the parsed boundaries; index 0 is a dummy entry */
    private double[] getAssignmentBoundaries_doubles() {
        return (double[]) getKeyedProperties().get("assignmentBoundaries_doubles");
    }

    /**
     * Only URIs whose via URI exists and carries the decisive value are processed.
     */
    @Override
    protected boolean shouldProcess(CrawlURI uri) {
        CrawlURI viaUri = uri.getFullVia();
        if (viaUri == null) {
            return false; // no parent document to get the value from
        }
        JSONObject info = viaUri.getExtraInfo();
        return info.has(getExtraInfoValueFieldName());
    }

    /**
     * Assigns scheduling directive, holder cost and precedence to the candidate
     * URI and records the decision in the URI's extra info.
     *
     * <p>If the URI already carries a value from an earlier pass that is lower
     * than or equal to the value of the current via URI, nothing is changed.
     * URIs reached from a seed via a REFER hop always get the HIGHEST
     * directive. URIs for which no directive can be determined are marked
     * {@code S_OUT_OF_SCOPE}.
     */
    @Override
    protected void innerProcess(CrawlURI uri) throws InterruptedException {
        // CrawlURI is a candidate URI which has to be scheduled according to our priority
        synchronized (_lck) {
            double value;
            try {
                value = getValueFromViaURI(uri);
                if (getValueFromCurrentURI(uri) <= value) {
                    // URI was already processed and its value was equal to or lower than the current via URI's
                    return;
                }
            } catch (RuntimeException e) {
                logCauseChain("Failed to get decisive value from extra info", e);
                return;
            }

            int directive = getPriorityAsSchedulingDirective(value);
            if (uri.getFullVia().isSeed() && uri.getLastHop().equals(Hop.REFER.getHopString())) {
                directive = SchedulingConstants.HIGHEST;
            }
            if (directive < 0) {
                _count_reject++;
                // this will not consider the url for further processing
                // TODO: there must be a better solution, maybe extend
                // org.archive.crawler.prefetch.FrontierPreparer or org.archive.crawler.prefetch.CandidateScoper
                uri.setFetchStatus(S_OUT_OF_SCOPE);
            }
            uri.setSchedulingDirective(directive);
            if (directive >= 0) {
                _assignment_counts[directive]++;
            }
            if (LOG.isLoggable(Level.FINEST)) {
                LOG.finest(String.format("Assigned scheduling directive %d to %s.", directive, uri.toString()));
            }

            // lower values have higher precedence, i.e. higher priority
            int cost = WORST_COST;
            if (directive == 0) {
                cost = 1;
            } else if (directive > 0) {
                cost = getPrecedenceCost(value);
            }
            uri.setHolderCost(cost);
            uri.setPrecedence(cost);
            if (LOG.isLoggable(Level.FINEST)) {
                LOG.finest(String.format("Assigned precedence cost %d to %s.", cost, uri.toString()));
            }

            try {
                JSONObject info = uri.getExtraInfo();
                info.put(SharedConstants.EXTRA_INFO_ASSIGNED_SCHEDULING_DIRECTIVE, directive);
                info.put(SharedConstants.EXTRA_INFO_ASSIGNED_COST_PRECEDENCE, cost);
                // Locale.ROOT: the value is parsed back with Double.parseDouble, which only accepts '.'
                // as decimal separator; the default locale could produce ',' instead.
                info.put(SharedConstants.EXTRA_INFO_PERPLEXITY_VIA, String.format(Locale.ROOT, "%012g", value));
            } catch (Exception e) {
                logCauseChain("Failed to add extra information to uri " + uri, e);
            }
        }
    }

    /**
     * Logs the given throwable and its causes (at most {@value #MAX_LOGGED_CAUSES}
     * levels) as warnings, one record per level.
     */
    private static void logCauseChain(String what, Throwable t) {
        Throwable current = t;
        for (int depth = 1; current != null && depth <= MAX_LOGGED_CAUSES; depth++) {
            LOG.log(Level.WARNING,
                    String.format("%s: (%d-%s:%s).", what, depth, current.getClass().getName(), current.getMessage()),
                    current);
            current = current.getCause();
        }
    }

    /**
     * Reads the value that an earlier pass of this processor stored on the URI itself.
     *
     * @param uri the candidate URI
     * @return the stored value, or {@link Double#MAX_VALUE} if the URI carries none
     * @throws IllegalStateException if the stored value cannot be read or parsed
     */
    double getValueFromCurrentURI(CrawlURI uri) throws IllegalStateException {
        try {
            JSONObject info = uri.getExtraInfo();
            if (info.length() == 0 || !info.has(SharedConstants.EXTRA_INFO_PERPLEXITY_VIA)) {
                return Double.MAX_VALUE;
            }
            return Double.parseDouble(info.getString(SharedConstants.EXTRA_INFO_PERPLEXITY_VIA));
        } catch (Exception e) {
            throw new IllegalStateException(
                    String.format("Failed to get value from field %s (%s).",
                            SharedConstants.EXTRA_INFO_PERPLEXITY_VIA, uri),
                    e);
        }
    }

    /**
     * Reads the decisive value from the extra info of the URI's via URI.
     *
     * @param uri the candidate URI
     * @return the decisive value of the via URI
     * @throws IllegalStateException if the via URI has no such value or it cannot be parsed
     */
    double getValueFromViaURI(CrawlURI uri) throws IllegalStateException {
        final String field = getExtraInfoValueFieldName();
        try {
            JSONObject info = uri.getFullVia().getExtraInfo();
            if (info.length() == 0 || !info.has(field)) {
                throw new IllegalStateException("Extra info is empty.");
            }
            return Double.parseDouble(info.getString(field));
        } catch (Exception e) {
            throw new IllegalStateException(
                    String.format("Failed to get value from field %s (%s).", field, uri), e);
        }
    }

    /**
     * Linearly maps a value from {@code [0, maxvalue)} to a precedence cost in
     * {@code [1, 253]}; lower costs are better.
     *
     * <p>Callers are expected to pass non-negative values; negative values are
     * not clamped.
     *
     * @param val the decisive value
     * @return 255 if {@code val} is NaN or not below the maximum, otherwise the scaled cost
     */
    int getPrecedenceCost(double val) {
        // cost needs to be in [1,255], lower values are better, try to squash values into this range
        if (Double.isNaN(val)) {
            return WORST_COST; // highest possible value
        }
        // [A, B] --> [a, b]
        // newval = (val - A)*(b-a)/(B-A) + a
        // A,a = 0
        // newval = val * (b/B)
        // use maxvalue as maximum if specified and below Integer.MAX_VALUE
        final double upper = Math.min(Integer.MAX_VALUE, getMaxvalue());
        final double scaledUpper = 253; // reserve 255 for everything that is greater than or equal to upper
        if (val >= upper) {
            return WORST_COST;
        }
        return (int) Math.floor(val * (scaledUpper / upper)) + 1;
    }

    /**
     * Maps the decisive value to one of the {@link SchedulingConstants}
     * directives HIGH, MEDIUM or NORMAL using the configured boundaries.
     *
     * @param perplexity the decisive value
     * @return the directive, or -1 if the value is negative, NaN or above every boundary
     */
    int getPriorityAsSchedulingDirective(double perplexity) {
        double[] bounds = getAssignmentBoundaries_doubles();
        if (Double.isNaN(perplexity) || perplexity < 0d) {
            return -1; // best remove from frontier
        }
        // HIGHEST = 0, HIGH = 1, ... but reserve HIGHEST for prerequisites
        if (perplexity <= bounds[SchedulingConstants.HIGH]) {
            return SchedulingConstants.HIGH; // higher than medium
        }
        if (perplexity <= bounds[SchedulingConstants.MEDIUM]) {
            return SchedulingConstants.MEDIUM; // higher than normal
        }
        if (perplexity <= bounds[SchedulingConstants.NORMAL]) {
            return SchedulingConstants.NORMAL; // default
        }
        // else best remove from frontier
        return -1;
    }

    /**
     * Best-effort helper that stores a key/value pair in the URI's extra info.
     * Failures never propagate; they are only logged at FINE level.
     */
    static void addExtraInfo(CrawlURI uri, String key, Object value) {
        try {
            uri.getExtraInfo().put(key, value);
        } catch (Exception ignored) {
            // Deliberately best effort: a failure here must not disturb processing.
            LOG.log(Level.FINE, "Could not add extra info '" + key + "'.", ignored);
        }
    }

    /**
     * @return a multi-line summary of how many URIs were assigned to each
     *         scheduling directive and how many were rejected
     */
    @Override
    public String report() {
        final double[] bounds = getAssignmentBoundaries_doubles();
        final long[] counts;
        final long rejected;
        synchronized (_lck) { // counters are written under the same lock in innerProcess
            counts = _assignment_counts.clone();
            rejected = _count_reject;
        }
        StringBuilder sb = new StringBuilder();
        sb.append(String.format("Processor: %s %n", getClass().getName()));
        sb.append(String.format(" SchedulingConstants assignment counts: [%n"));
        sb.append(String.format(" %d '%s' (<= %g); %n",
                counts[SchedulingConstants.HIGHEST], "HIGHEST", bounds[SchedulingConstants.HIGHEST]));
        sb.append(String.format(" %d '%s' (<= %g); %n",
                counts[SchedulingConstants.HIGH], "HIGH", bounds[SchedulingConstants.HIGH]));
        sb.append(String.format(" %d '%s' (<= %g); %n",
                counts[SchedulingConstants.MEDIUM], "MEDIUM", bounds[SchedulingConstants.MEDIUM]));
        sb.append(String.format(" %d '%s' (<= %g); %n",
                counts[SchedulingConstants.NORMAL], "NORMAL", bounds[SchedulingConstants.NORMAL]));
        sb.append(String.format(" %d '%s'%n", rejected, "REJECTED"));
        sb.append(String.format(" ]%n"));
        return sb.toString();
    }
}