de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValuePrioritizer.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValuePrioritizer.java

Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.ltbot.postprocessor;

import static org.archive.modules.fetcher.FetchStatusCodes.S_OUT_OF_SCOPE;

import java.util.Arrays;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.SchedulingConstants;
import org.archive.modules.extractor.Hop;
import org.json.JSONObject;

/**
 * Processor that prioritizes a candidate URI based on a numeric "decisive value" (by default the
 * perplexity, see {@link SharedConstants#EXTRA_INFO_PERPLEXITY}) found in the extra info of the
 * URI it was discovered from (its via URI).
 *
 * <p>
 * The value is mapped to
 * <ul>
 * <li>a scheduling directive ({@code HIGH}, {@code MEDIUM} or {@code NORMAL}) using the configured
 * assignment boundaries; values outside all boundaries mark the URI as out of scope, and</li>
 * <li>a precedence / holder cost in {@code [1,255]}, where lower is better.</li>
 * </ul>
 * {@code HIGHEST} is never assigned from the value; it is reserved for URIs reached from a seed via
 * a {@link Hop#REFER} hop.
 *
 * <p>
 * Counters are updated while holding an internal lock; {@link #report()} reads them under the same
 * lock.
 *
 * @author Steffen Remus
 **/
public class DecesiveValuePrioritizer extends Processor {

    private static final Logger LOG = Logger.getLogger(DecesiveValuePrioritizer.class.getName());

    /** Maximum number of chained causes that are written to the log for one failure. */
    private static final int MAX_LOGGED_CAUSES = 9;

    /** Worst possible precedence cost; used for rejected URIs and values at or above the maximum. */
    private static final int MAX_COST = 255;

    /** Number of URIs that were marked out of scope. Guarded by {@link #_lck}. */
    private long _count_reject;

    /** Number of URIs per assigned scheduling directive (index = directive). Guarded by {@link #_lck}. */
    private final long[] _assignment_counts;

    private final Object _lck = new Object();

    /**
     * Creates a prioritizer with the defaults: perplexity as decisive value, boundaries
     * {@code 5e2,5e3,Infinity} and a maximum value of {@code 15} for the cost mapping.
     */
    public DecesiveValuePrioritizer() {
        setExtraInfoValueFieldName(SharedConstants.EXTRA_INFO_PERPLEXITY);
        // one boundary for each priority (HIGH,MEDIUM,NORMAL); HIGHEST is reserved for prerequisites
        setAssignmentBoundaries("5e2,5e3,Infinity");
        setMaxvalue(15);
        // one counter for each priority HIGHEST..NORMAL; Java zero-initializes the array
        _assignment_counts = new long[SchedulingConstants.NORMAL + 1];
        _count_reject = 0L;
    }

    /** @return name of the extra info field of the via URI that holds the decisive value */
    public String getExtraInfoValueFieldName() {
        return (String) getKeyedProperties().get("ExtraInfoFieldName");
    }

    /** @param extraInfoFieldName name of the extra info field of the via URI that holds the decisive value */
    public void setExtraInfoValueFieldName(String extraInfoFieldName) {
        getKeyedProperties().put("ExtraInfoFieldName", extraInfoFieldName);
    }

    /** @return the assignment boundaries exactly as they were configured (comma separated) */
    public String getAssignmentBoundaries() {
        return (String) getKeyedProperties().get("assignmentBoundaries");
    }

    /**
     * Sets the upper boundaries (inclusive) of the decisive value for the priorities HIGH, MEDIUM
     * and NORMAL, in that order.
     *
     * @param assignmentBoundaries comma separated list of doubles, e.g. {@code "5e2,5e3,Infinity"}
     * @throws NumberFormatException if an entry is not a valid double; in that case the previously
     *         configured boundaries stay untouched
     */
    public void setAssignmentBoundaries(String assignmentBoundaries) {
        String[] parts = assignmentBoundaries.split(",");
        // index 0 (HIGHEST) is a placeholder so that the array can be indexed by scheduling directive
        double[] bounds = new double[parts.length + 1];
        for (int i = 0; i < parts.length; i++) {
            bounds[i + 1] = Double.parseDouble(parts[i].trim());
        }
        // store only after everything parsed, so both properties always agree
        getKeyedProperties().put("assignmentBoundaries", assignmentBoundaries);
        getKeyedProperties().put("assignmentBoundaries_doubles", bounds);
    }

    /** @return the decisive value at (or above) which the worst precedence cost is assigned */
    public double getMaxvalue() {
        // go through Number so that a stored Integer/Long/Float does not cause a ClassCastException
        return ((Number) getKeyedProperties().get("maxvalue")).doubleValue();
    }

    /** @param value the decisive value at (or above) which the worst precedence cost is assigned */
    public void setMaxvalue(double value) {
        getKeyedProperties().put("maxvalue", value);
    }

    /** @return parsed boundaries, indexed by scheduling directive (index 0 is a placeholder) */
    private double[] getAssignmentBoundaries_doubles() {
        return (double[]) getKeyedProperties().get("assignmentBoundaries_doubles");
    }

    /**
     * Only URIs that have a via URI carrying the decisive value can be prioritized.
     */
    @Override
    protected boolean shouldProcess(CrawlURI uri) {
        CrawlURI via_uri = uri.getFullVia();
        if (via_uri == null) {
            return false; // no parent document to get the value from
        }
        JSONObject info = via_uri.getExtraInfo();
        return info.has(getExtraInfoValueFieldName());
    }

    /**
     * Assigns scheduling directive, holder cost and precedence to the candidate URI and records
     * the decision in its extra info.
     *
     * <p>
     * If the URI already carries a value from an earlier via URI that is lower than or equal to the
     * current one, the URI is left untouched. Failures while reading the values are logged and the
     * URI is left untouched as well.
     */
    @Override
    protected void innerProcess(CrawlURI uri) throws InterruptedException {
        // CrawlURI is a candidate Uri which has to be scheduled according to our priority
        synchronized (_lck) {
            double value;
            try {
                value = getValueFromViaURI(uri);
                // URI was already processed and its value was equal to or lower than the one of the current via url
                if (getValueFromCurrentURI(uri) <= value) {
                    return;
                }
            } catch (RuntimeException e) {
                logCauseChain("Failed to get decisive value from extra info", e);
                return;
            }

            int directive = getPriorityAsSchedulingDirective(value);
            if (uri.getFullVia().isSeed() && uri.getLastHop().equals(Hop.REFER.getHopString())) {
                directive = SchedulingConstants.HIGHEST;
            }

            boolean rejected = directive < 0;
            if (rejected) {
                _count_reject++;
                // this will not consider the url for further processing
                // TODO: there must be a better solution, maybe extend org.archive.crawler.prefetch.FrontierPreparer or org.archive.crawler.prefetch.CandidateScoper
                uri.setFetchStatus(S_OUT_OF_SCOPE);
            }

            uri.setSchedulingDirective(directive);
            if (!rejected) {
                _assignment_counts[directive]++;
            }
            if (LOG.isLoggable(Level.FINEST)) {
                LOG.finest(String.format("Assigned scheduling directive %d to %s.", directive, uri.toString()));
            }

            // lower values have higher precedence, i.e. higher priority
            int cost;
            if (rejected) {
                cost = MAX_COST;
            } else if (directive == SchedulingConstants.HIGHEST) {
                cost = 1;
            } else {
                cost = getPrecedenceCost(value);
            }
            uri.setHolderCost(cost);
            uri.setPrecedence(cost);
            if (LOG.isLoggable(Level.FINEST)) {
                LOG.finest(String.format("Assigned precedence cost %d to %s.", cost, uri.toString()));
            }

            try {
                JSONObject info = uri.getExtraInfo();
                info.put(SharedConstants.EXTRA_INFO_ASSIGNED_SCHEDULING_DIRECTIVE, directive);
                info.put(SharedConstants.EXTRA_INFO_ASSIGNED_COST_PRECEDENCE, cost);
                // Locale.ROOT: the string is parsed back by getValueFromCurrentURI, which needs '.' as
                // decimal separator regardless of the default locale of the JVM
                info.put(SharedConstants.EXTRA_INFO_PERPLEXITY_VIA, String.format(Locale.ROOT, "%012g", value));
            } catch (Exception e) {
                logCauseChain(String.format("Failed to add extra information to uri %s", uri.toString()), e);
            }
        }
    }

    /**
     * Logs a failure and its chain of causes as warnings, one entry per throwable, limited to
     * {@link #MAX_LOGGED_CAUSES} entries. The stack trace is attached to the first entry only,
     * because it already contains the traces of all causes.
     */
    private static void logCauseChain(String summary, Throwable failure) {
        Throwable current = failure;
        for (int depth = 1; current != null && depth <= MAX_LOGGED_CAUSES; depth++) {
            String message = String.format("%s: (%d-%s:%s).", summary, depth, current.getClass().getName(),
                    current.getMessage());
            if (depth == 1) {
                LOG.log(Level.WARNING, message, current);
            } else {
                LOG.log(Level.WARNING, message);
            }
            current = current.getCause();
        }
    }

    /**
     * Reads the value that was stored on the URI itself when it was prioritized before.
     *
     * @return the stored value, or {@link Double#MAX_VALUE} if the URI has not been prioritized yet
     * @throws IllegalStateException if the stored value cannot be read or parsed (cause attached)
     */
    double getValueFromCurrentURI(CrawlURI uri) throws IllegalStateException {
        try {
            JSONObject info = uri.getExtraInfo();
            if (!info.has(SharedConstants.EXTRA_INFO_PERPLEXITY_VIA)) {
                return Double.MAX_VALUE;
            }
            return Double.parseDouble(info.getString(SharedConstants.EXTRA_INFO_PERPLEXITY_VIA));
        } catch (Exception e) {
            throw new IllegalStateException(String.format("Failed to get value from field %s (%s).",
                    SharedConstants.EXTRA_INFO_PERPLEXITY_VIA, uri.toString()), e);
        }
    }

    /**
     * Reads the decisive value from the extra info of the via URI.
     *
     * @return the decisive value of the via URI
     * @throws IllegalStateException if the via URI has no such value or it cannot be parsed
     *         (cause attached)
     */
    double getValueFromViaURI(CrawlURI uri) throws IllegalStateException {
        try {
            JSONObject info = uri.getFullVia().getExtraInfo();
            if (!info.has(getExtraInfoValueFieldName())) {
                throw new IllegalStateException("Extra info is empty.");
            }
            return Double.parseDouble(info.getString(getExtraInfoValueFieldName()));
        } catch (Exception e) {
            throw new IllegalStateException(String.format("Failed to get value from field %s (%s).",
                    getExtraInfoValueFieldName(), uri.toString()), e);
        }
    }

    /**
     * Squashes the decisive value linearly from {@code [0, maxvalue)} into a precedence cost.
     *
     * @param val the decisive value
     * @return a cost in {@code [1,254]} for values below the maximum (lower is better), or
     *         {@code 255} if the value is NaN or not below the maximum
     */
    int getPrecedenceCost(double val) {
        // use maxvalue as maximum, but never more than Integer.MAX_VALUE
        double B = Math.min(Integer.MAX_VALUE, getMaxvalue());
        // NaN must be tested with isNaN: 'val == Double.NaN' is always false
        if (Double.isNaN(val) || Double.isNaN(B) || val >= B) {
            return MAX_COST;
        }
        //            [A, B] --> [a, b]
        //            newval = (val - A)*(b-a)/(B-A) + a
        //            A,a = 0
        //            newval = val * (b/B)
        double b = 253; // reserve 255 for everything that is greater than or equal to B
        int precedence_cost = (int) Math.floor(val * (b / B)) + 1;
        // negative input or floating point rounding must not leave the valid range
        return Math.max(1, Math.min(MAX_COST - 1, precedence_cost));
    }

    /**
     * Maps the decisive value to a scheduling directive using the configured boundaries.
     *
     * @param perplexity the decisive value
     * @return {@code HIGH}, {@code MEDIUM} or {@code NORMAL} for the first boundary that is not
     *         exceeded, or {@code -1} if the value is negative, NaN or above all configured
     *         boundaries (i.e. the URI should be removed from the frontier)
     */
    int getPriorityAsSchedulingDirective(double perplexity) {
        if (Double.isNaN(perplexity) || perplexity < 0d) {
            return -1; // best remove from frontier
        }
        double[] bounds = getAssignmentBoundaries_doubles();
        // HIGHEST = 0, HIGH = 1, ... but reserve HIGHEST for prerequisites;
        // the length check tolerates configurations with fewer than three boundaries
        for (int priority = SchedulingConstants.HIGH; priority <= SchedulingConstants.NORMAL
                && priority < bounds.length; priority++) {
            if (perplexity <= bounds[priority]) {
                return priority;
            }
        }
        // else best remove from frontier
        return -1;
    }

    /**
     * Best-effort helper that stores a key/value pair in the extra info of the URI. Failures never
     * propagate to the caller; they are only logged at level FINE.
     */
    static void addExtraInfo(CrawlURI uri, String key, Object value) {
        try {
            uri.getExtraInfo().put(key, value);
        } catch (Exception ignored) {
            // deliberately not rethrown: extra info is auxiliary and must not break processing
            LOG.log(Level.FINE, "Failed to add extra info.", ignored);
        }
    }

    /**
     * @return a human readable summary of how many URIs were assigned to which scheduling
     *         directive and how many were rejected
     */
    @Override
    public String report() {
        double[] bounds = getAssignmentBoundaries_doubles();
        long[] counts;
        long rejected;
        synchronized (_lck) { // consistent snapshot of the counters written by innerProcess
            counts = Arrays.copyOf(_assignment_counts, _assignment_counts.length);
            rejected = _count_reject;
        }

        StringBuilder sb = new StringBuilder();
        sb.append(String.format("Processor: %s %n", getClass().getName()));
        sb.append(String.format("  SchedulingConstants assignment counts: [%n"));
        // HIGHEST is not assigned by value, hence it has no boundary to report
        sb.append(String.format("    %d '%s' (reserved); %n", counts[SchedulingConstants.HIGHEST], "HIGHEST"));

        int[] priorities = { SchedulingConstants.HIGH, SchedulingConstants.MEDIUM, SchedulingConstants.NORMAL };
        String[] labels = { "HIGH", "MEDIUM", "NORMAL" };
        for (int i = 0; i < priorities.length; i++) {
            int priority = priorities[i];
            if (bounds != null && priority < bounds.length) {
                sb.append(String.format("    %d '%s' (<= %g); %n", counts[priority], labels[i], bounds[priority]));
            } else {
                sb.append(String.format("    %d '%s' (no boundary configured); %n", counts[priority], labels[i]));
            }
        }

        sb.append(String.format("    %d '%s'%n", rejected, "REJECTED"));
        sb.append(String.format("  ]%n"));
        return sb.toString();
    }

}