// Java tutorial
/* * Copyright 2012 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.lt.ltbot.prefetch; import static org.archive.modules.fetcher.FetchStatusCodes.S_OUT_OF_SCOPE; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Logger; import org.apache.commons.httpclient.URIException; import org.archive.crawler.framework.Scoper; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; import org.archive.modules.SchedulingConstants; import org.archive.modules.extractor.Hop; import org.archive.net.UURI; import de.tudarmstadt.lt.ltbot.postprocessor.SharedConstants; /** * * @author Steffen Remus **/ public class DecesiveValuePrioritizer extends Scoper { private final static Logger LOG = Logger.getLogger(DecesiveValuePrioritizer.class.getName()); private AtomicLong _count_reject; private final AtomicLong[] _assignment_counts; protected double[] _assignmentBoundaries; public DecesiveValuePrioritizer() { setExtraInfoValueFieldName(SharedConstants.EXTRA_INFO_PERPLEXITY); setAssignmentBoundaries("5e2,5e3,Infinity"); // one for each priority (HIGH,MEDIUM,NORMAL) HIGHER is reserved for prerequisites setMaxValue(5e4); setMaxPrecedence(127); _assignment_counts = new AtomicLong[4]; // one for each priority for (int i = 0; i < _assignment_counts.length; _assignment_counts[i++] = new AtomicLong()) ; _count_reject = new AtomicLong(); } public String getExtraInfoValueFieldName() { return (String) getKeyedProperties().get("ExtraInfoFieldName"); 
} public void setExtraInfoValueFieldName(String extraInfoFieldName) { getKeyedProperties().put("ExtraInfoFieldName", extraInfoFieldName); } public String getAssignmentBoundaries() { return (String) getKeyedProperties().get("assignmentBoundaries"); } public void setAssignmentBoundaries(String assignmentBoundaries) { String[] bounds_as_strarr = assignmentBoundaries.split(","); double[] bounds = new double[bounds_as_strarr.length + 1]; // first value is a dummy value, never used for (int i = 0; i < bounds_as_strarr.length; i++) bounds[i + 1] = Double.valueOf(bounds_as_strarr[i]); kp.put("assignmentBoundaries", assignmentBoundaries); _assignmentBoundaries = bounds; } protected double _maxvalue; public double getMaxvalue() { return _maxvalue; } public void setMaxValue(double value) { _maxvalue = value; } protected int _maxPrecedence; public int getMaxPrecedence() { return _maxPrecedence; } public void setMaxPrecedence(int value) { _maxPrecedence = value; } @Override protected boolean shouldProcess(CrawlURI uri) { return true; } @Override protected void innerProcess(CrawlURI uri) throws InterruptedException { super.innerProcessResult(uri); } @Override protected ProcessResult innerProcessResult(CrawlURI uri) throws InterruptedException { CrawlURI via_uri = uri.getFullVia(); return innerProcessResult(uri, via_uri, 0); } protected ProcessResult innerProcessResult(CrawlURI uri, CrawlURI via_uri, int recursion_count) throws InterruptedException { // uri is seed if (uri.isSeed()) { LOG.info(uri + " isSeed"); return schedule(uri, via_uri, 2d, 0, false, ""); } boolean uri_is_redirect = uri.getLastHop().equals(Hop.REFER.getHopString());//s || uri.getLastHop().equals(Hop.EMBED.getHopString()); boolean uri_is_robotstxt = isRobotstxt(uri); if (via_uri == null) { String debug_uri = String.format( "%s:%n%s\ndirective / cost %s/%s%n %s --> via %s (%s)%n --> fullvia %s%n --> recursive via %d %s%n", uri.toString() + (uri.isPrerequisite() ? " is preq" : "") + (uri_is_redirect ? 
" is redirect" : "") + (uri.isSeed() ? " is seed" : "") + (uri_is_robotstxt ? " is robots.txt" : ""), uri.getData(), uri.getSchedulingDirective(), uri.getPrecedence(), uri.getPathFromSeed(), uri.getVia(), uri.flattenVia(), uri.getFullVia(), recursion_count, via_uri); LOG.warning(String.format("'%s' has no via URL %s / %s.%n+++START+++%n%s%n+++END+++%n", uri, uri.flattenVia(), via_uri, debug_uri)); return ProcessResult.FINISH; } boolean via_is_redirect = via_uri.getLastHop().equals(Hop.REFER.getHopString()) || via_uri.getLastHop().equals(Hop.EMBED.getHopString()); //|| (via_uri.getFetchStatus() / 100) == 3 /* redirect status code */ boolean via_uri_is_robotstxt = isRobotstxt(via_uri); String debug_uri = String.format( "%s:%n%s\ndirective / cost %s/%s %n%s %d ---> via %s%n%s\ndirective / cost %s/%s", uri.toString() + (uri.isPrerequisite() ? " is preq" : "") + (uri_is_redirect ? " is redirect" : "") + (uri.isSeed() ? " is seed" : "") + (uri_is_robotstxt ? " is robots.txt" : ""), uri.getData(), uri.getSchedulingDirective(), uri.getPrecedence(), uri.getPathFromSeed(), recursion_count, via_uri.toString() + (via_uri.isPrerequisite() ? " is preq" : "") + (via_is_redirect ? " is redirect" : "") + (via_uri.isSeed() ? " is seed" : "") + (via_uri_is_robotstxt ? " is robots.txt" : ""), via_uri.getData(), via_uri.getSchedulingDirective(), via_uri.getPrecedence()); LOG.finest(debug_uri); // double perplexity = _maxvalue; Object perpobj = getExtraInfo(via_uri, getExtraInfoValueFieldName()); if (perpobj == null) { if (via_uri.isSeed()) return schedule(uri, via_uri, 2d + recursion_count + 1, recursion_count + 1, false, debug_uri); if (via_uri_is_robotstxt) return schedule(uri, via_uri, Math.min(_maxvalue, _assignmentBoundaries[SchedulingConstants.NORMAL]) - 1d, recursion_count + 1, true, debug_uri); return innerProcessResult(uri, via_uri.getFullVia(), recursion_count + 1); } // if(uri_is_redirect || uri.isPrerequisite() || via_uri.isPrerequisite()) // uri is embed/refer/prereq/... 
// return scheduleSame(uri, via_uri); // else double perplexity = Double.valueOf((String) perpobj);//getPerplexity(uri, via_uri, debug_uri); return schedule(uri, via_uri, perplexity, recursion_count, false, debug_uri); } double getPerplexity(CrawlURI uri, CrawlURI via_uri, String debug_uri) { Object obj = getExtraInfo(via_uri, getExtraInfoValueFieldName()); if (obj == null) { LOG.info(String.format("%s - (%s)\tno priority value found at field %s.%n+++BEGIN+++%n%s%n+++END+++%n", via_uri, uri.flattenVia(), getExtraInfoValueFieldName(), debug_uri)); // FIXME: schedule same in such a case? // perplexity = getPerplexity(uri, via_uri, getExtraInfoValueFieldName() + "_via"); // LOG.warning(String.format("unable to schedule %s.", uri)); // obj = getExtraInfo(via_uri, getExtraInfoValueFieldName() + "_via"); // if(obj == null){ // LOG.warning(String.format("%s - (%s)\tno priority value found at field %s.", via_uri, uri.flattenVia(), getExtraInfoValueFieldName()+"_via")); return _maxvalue; // } } double value = Double.valueOf((String) obj); return value; } ProcessResult schedule(CrawlURI uri, CrawlURI via_uri, double perplexity, int recursion_count, boolean copy_perp, String debug_uri) { LOG.fine(String.format("Perplexity %s = %f", uri.toString(), perplexity)); int schedulingdirective = getPriorityAsSchedulingDirective(perplexity); if (schedulingdirective < 0) { // "forget" url _count_reject.incrementAndGet(); uri.clearPrerequisiteUri(); uri.setFetchStatus(S_OUT_OF_SCOPE); // this will not consider the url for further processing // TODO: there must be a better solution, maybe extend org.archive.crawler.prefetch.FrontierPreparer or org.archive.crawler.prefetch.CandidateScoper uri.setSchedulingDirective(SchedulingConstants.NORMAL); // just in case, put it into the least important bucket uri.setHolderCost(_maxPrecedence); uri.setPrecedence(_maxPrecedence); uri.addExtraInfo(getExtraInfoValueFieldName() + "_via", String.format("%012g", perplexity)); 
LOG.fine(String.format("Assigned scheduling directive %d to %s.", uri.getSchedulingDirective(), uri.toString())); LOG.fine(String.format("Assigned precedence cost %d to %s.", uri.getHolderCost(), uri.toString())); return ProcessResult.FINISH; } int cost = getPrecedenceCost(perplexity, schedulingdirective); cost = Math.max(cost - recursion_count, 0); if (uri.isPrerequisite()) uri.setSchedulingDirective(Math.max(SchedulingConstants.HIGHEST, schedulingdirective - 1)); else uri.setSchedulingDirective(schedulingdirective); uri.setHolderCost(cost); uri.setPrecedence(cost); _assignment_counts[schedulingdirective].incrementAndGet(); LOG.fine(String.format("Assigned scheduling directive %d to %s.", schedulingdirective, uri.toString())); LOG.fine(String.format("Assigned precedence cost %d to %s.", cost, uri.toString())); addExtraInfo(uri, SharedConstants.EXTRA_INFO_ASSIGNED_SCHEDULING_DIRECTIVE, schedulingdirective); addExtraInfo(uri, SharedConstants.EXTRA_INFO_ASSIGNED_COST_PRECEDENCE, cost); addExtraInfo(uri, getExtraInfoValueFieldName() + "_via", String.format("%012g", perplexity)); if (copy_perp) addExtraInfo(uri, getExtraInfoValueFieldName(), String.format("%012g", perplexity)); return ProcessResult.PROCEED; } ProcessResult scheduleSame(CrawlURI uri, CrawlURI via_uri) { if (uri.isPrerequisite()) { if (via_uri.isSeed() && !uri.getUURI().getScheme().equals("dns")) uri.setSchedulingDirective( Math.max(SchedulingConstants.HIGHEST, via_uri.getSchedulingDirective() - 1)); else uri.setSchedulingDirective(via_uri.getSchedulingDirective()); if (!via_uri.isPrerequisite()) uri.setSchedulingDirective( Math.max(SchedulingConstants.HIGH, via_uri.getSchedulingDirective() - 1)); } else uri.setSchedulingDirective(via_uri.getSchedulingDirective()); int cost = Math.max(0, via_uri.getHolderCost() - 1); uri.setHolderCost(cost); uri.setPrecedence(cost); _assignment_counts[uri.getSchedulingDirective()].incrementAndGet(); LOG.finest(String.format("Assigned scheduling directive %d to %s.", 
uri.getSchedulingDirective(), uri.toString())); LOG.finest(String.format("Assigned precedence cost %d to %s.", uri.getHolderCost(), uri.toString())); addExtraInfo(uri, SharedConstants.EXTRA_INFO_ASSIGNED_SCHEDULING_DIRECTIVE, uri.getSchedulingDirective()); addExtraInfo(uri, SharedConstants.EXTRA_INFO_ASSIGNED_COST_PRECEDENCE, uri.getHolderCost()); Object val = via_uri.getData().get(getExtraInfoValueFieldName()); if (val != null) { uri.getData().put(getExtraInfoValueFieldName(), val); uri.addExtraInfo(getExtraInfoValueFieldName(), val); } val = via_uri.getData().get(getExtraInfoValueFieldName() + "_via"); if (val != null) { uri.getData().put(getExtraInfoValueFieldName() + "_via", val); uri.addExtraInfo(getExtraInfoValueFieldName() + "_via", val); } return ProcessResult.PROCEED; } Object getExtraInfo(CrawlURI uri, String key) throws IllegalStateException { // if present in data map return value if (uri.getData().containsKey(key)) return uri.getData().get(key); // else check if its available in extra info json object try { if (uri.getExtraInfo().has(key)) return uri.getExtraInfo().get(key); } catch (Throwable t) { /* NOTHING SHOULD HAPPEN HERE. AND IN THE UNLIKELY CASE THAT SOMETHING HAPPENS I DO NOT CARE ABOUT IT. 
*/ } return null; } int getPrecedenceCost(double val, int schedulingConstants_priority) { // cost should be in [0,_maxPrecedence], lower values are better, try to squash values into this range int cost = _maxPrecedence; switch (schedulingConstants_priority) { case SchedulingConstants.HIGHEST: return 1; // 2^0 case SchedulingConstants.HIGH: return 4; // 2^2 case SchedulingConstants.MEDIUM: return 8; // 2^3 case SchedulingConstants.NORMAL: cost = 64; // 2^6 } // --> squeeze [_assignmentBoundaries[SchedulingConstants.NORMAL], _maxvalue] into [64, _maxPrecedence] // [A, B] --> [a, b] // newval = (val - A)*(b-a)/(B-A) + a double B = Math.min(Integer.MAX_VALUE, _maxvalue); // use maxvalue as maximum if specified and below Integer.maxvalue if (val >= B) return _maxPrecedence; // take log of A, B and val to make computation numerically more stable B = Math.log1p(B); double A = Math.log1p(_assignmentBoundaries[SchedulingConstants.MEDIUM]); val = Math.log1p(val); // assert val >= A : "Value is smaller than lower boundary. That should not happen."; double a = cost; double b = _maxPrecedence; cost = (int) Math.ceil((val - A) * (b - a) / (B - A) + a); return cost; } int getPriorityAsSchedulingDirective(double perplexity) { if (perplexity <= 1d) return -1; // remove from frontier if (!Double.isFinite(perplexity)) if (!Double.isFinite(_assignmentBoundaries[SchedulingConstants.NORMAL])) return SchedulingConstants.NORMAL; // default else return -1; // remove // HIGHEST = 0, HIGH = 1, ... 
but reserve HIGHEST for prerequistes if (perplexity <= _assignmentBoundaries[SchedulingConstants.HIGH]) return SchedulingConstants.HIGH; // higher than medium if (perplexity <= _assignmentBoundaries[SchedulingConstants.MEDIUM]) return SchedulingConstants.MEDIUM; // higher than normal if (perplexity <= _assignmentBoundaries[SchedulingConstants.NORMAL]) return SchedulingConstants.NORMAL; // default // else best remove from frontier // should not happen // assert false : "You should not be here"; return -1; } static void addExtraInfo(CrawlURI uri, String key, Object value) { uri.getData().put(key, value); try { uri.getExtraInfo().put(key, value); } catch (Throwable t) { /* NOTHING SHOULD HAPPEN HERE. AND IN THE UNLIKELY CASE THAT SOMETHING HAPPENS I DO NOT CARE ABOUT IT. */ } } @Override public String report() { StringBuilder sb = new StringBuilder(); sb.append(String.format("Processor: %s %n", getClass().getName())); sb.append(String.format(" SchedulingConstants assignment counts: [%n", getClass().getName())); sb.append(String.format(" %d '%s' (%s); %n", _assignment_counts[SchedulingConstants.HIGHEST].get(), "HIGHEST", "Prerequisites")); sb.append(String.format(" %d '%s' (<= %g); %n", _assignment_counts[SchedulingConstants.HIGH].get(), "HIGH", _assignmentBoundaries[SchedulingConstants.HIGH])); sb.append(String.format(" %d '%s' (<= %g); %n", _assignment_counts[SchedulingConstants.MEDIUM].get(), "MEDIUM", _assignmentBoundaries[SchedulingConstants.MEDIUM])); sb.append(String.format(" %d '%s' (<= %g); %n", _assignment_counts[SchedulingConstants.NORMAL].get(), "NORMAL", _assignmentBoundaries[SchedulingConstants.NORMAL])); sb.append(String.format(" %d '%s'%n", _count_reject.get(), "REJECTED")); sb.append(String.format(" ]%n")); return sb.toString(); } private static boolean isRobotstxt(CrawlURI curi) { UURI uuri = curi.getUURI(); try { return uuri != null && uuri.getPath() != null && curi.getUURI().getPath().equals("/robots.txt"); } catch (URIException e) { 
LOG.severe("Failed get of path for " + curi); return false; } } }