package com.flaptor.hounder.crawler.modules;

import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.Set;

import org.apache.log4j.Logger;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;

import com.flaptor.hounder.crawler.pagedb.Page;
import com.flaptor.hounder.crawler.pagedb.PageDB;
import com.flaptor.hounder.crawler.APageMapper;
import com.flaptor.hounder.crawler.UrlHashMapper;
import com.flaptor.hounder.indexer.IRemoteIndexer;
import com.flaptor.hounder.indexer.Indexer;
import com.flaptor.hounder.indexer.IndexerReturnCode;
import com.flaptor.hounder.indexer.MockIndexer;
import com.flaptor.hounder.indexer.RmiIndexerStub;
import com.flaptor.util.Config;
import com.flaptor.util.DomUtil;
import com.flaptor.util.Execute;
import com.flaptor.util.Pair;
import com.flaptor.util.PortUtil;
import com.flaptor.util.QuadCurve;

/**
 * Indexer Module for FetchdataProcessor.
 * @todo check how to give better error messages. Those logged now cannot
 *  help to identify the problematic document.
 * @author Flaptor Development Team
 */
public class IndexerModule extends AProcessorModule {
    private static final Logger logger = Logger.getLogger(Execute.whoAmI());

    // --- limits -----------------------------------------------------------
    /** The maximum allowed page text length. */
    private int textLengthLimit;
    /** The maximum allowed page title length. */
    private int titleLengthLimit;
    /** Time in seconds between retries when the indexer is busy. */
    private int indexerBusyRetryTime;

    // --- boost formula parameters (see weightToDamp() and factor()) --------
    /** The amount of damping for the categoryBoost value in the boost formula. */
    private int categoryBoostDamp;
    /** The amount of damping for the pagerankBoost value in the boost formula. */
    private int pagerankBoostDamp;
    /** The amount of damping for the spamrankBoost value in the boost formula. */
    private int spamrankBoostDamp;
    /** The amount of damping for the log value in the boost formula. */
    private int logBoostDamp;
    /** The amount of damping for the freshnessBoost value in the boost formula. */
    private int freshnessBoostDamp;
    /** The weight of the freshness parameter. */
    private int freshnessWeight;
    /** The number of days at which the freshness value halves. */
    private int freshnessDamp;

    // --- indexing targets ---------------------------------------------------
    /** The Hounder indexers documents are sent to. */
    private IRemoteIndexer[] indexers;
    /** A mapper to choose an indexer for a given page. */
    private APageMapper pageMapper;
    /** The name of the crawl, added to the index so searches can be restricted to the results of this crawler. */
    private String crawlName;

    // --- per-cycle state, assigned when the "startCycle" command is applied -
    /** The values for the (0 to 100 step 10) percentiles in the page score histogram. */
    private float[] scoreThreshold;
    /** The values for the (0 to 100 step 10) percentiles in the page anti-score histogram. */
    private float[] antiScoreThreshold;

    /** The parts of web host names that are not interesting, like www. */
    private HashSet<String> hostStopWords; // parameterized: the set only ever holds host name parts
    /** If true the page content will be sent to the indexer in a <body> tag. */
    private boolean sendContent;

    /**
     * Creates the module, reading its settings from the global and the module configuration
     * and setting up the indexers that will receive the documents.
     *
     * <p>When {@code use.mock.indexer} is true a single {@link MockIndexer} is used. Otherwise one
     * {@link RmiIndexerStub} is created per entry of {@code indexer.node.list}, and the mapper
     * named by {@code indexer.node.mapper} chooses the indexer for each page.
     *
     * @param moduleName the name of this module, handed to the superclass
     * @param globalConfig the crawler-wide configuration
     * @throws IllegalArgumentException if {@code freshness.params} does not hold two values
     */
    public IndexerModule(String moduleName, Config globalConfig) {
        super(moduleName, globalConfig);
        textLengthLimit = globalConfig.getInt("page.text.max.length");
        titleLengthLimit = globalConfig.getInt("page.title.max.length");
        Config mdlConfig = getModuleConfig();
        indexerBusyRetryTime = mdlConfig.getInt("indexer.busy.retry.time");
        // NOTE(review): the property key is an empty string in the reviewed source; the name of
        // the crawl-name property has to be restored before this can work.
        crawlName = globalConfig.getString("");

        categoryBoostDamp = weightToDamp(mdlConfig.getFloat("category.boost.weight"));
        pagerankBoostDamp = weightToDamp(mdlConfig.getFloat("pagerank.boost.weight"));
        spamrankBoostDamp = weightToDamp(mdlConfig.getFloat("spamrank.boost.weight"));
        logBoostDamp = weightToDamp(mdlConfig.getFloat("log.boost.weight"));
        freshnessBoostDamp = weightToDamp(mdlConfig.getFloat("freshness.boost.weight"));

        // freshness.params = <weight>,<damp days>; fail with a readable message instead of an
        // ArrayIndexOutOfBoundsException when the property is incomplete.
        int[] freshnessParams = mdlConfig.getIntArray("freshness.params");
        if (null == freshnessParams || freshnessParams.length < 2) {
            throw new IllegalArgumentException("freshness.params must hold two values: <weight>,<damp days>");
        }
        freshnessWeight = freshnessParams[0];
        freshnessDamp = freshnessParams[1];

        hostStopWords = new HashSet<String>(Arrays.asList(mdlConfig.getStringArray("host.stopwords")));
        // NOTE(review): empty property key in the reviewed source; restore the name of the
        // "send content" flag.
        sendContent = mdlConfig.getBoolean("");

        // instantiate the indexer.
        if (mdlConfig.getBoolean("use.mock.indexer")) {
            logger.warn("Using a mock indexer. This should be used only for testing.");
            this.indexers = new IRemoteIndexer[1];
            this.indexers[0] = new MockIndexer();
            pageMapper = new UrlHashMapper(mdlConfig, 1);
        } else {
            String[] specs = null;
            try {
                specs = mdlConfig.getStringArray("indexer.node.list");
            } catch (Exception e) {
                // for backward compatibility:
                // NOTE(review): the legacy property key is an empty string in the reviewed source.
                specs = mdlConfig.getStringArray("");
            }
            this.indexers = new IRemoteIndexer[specs.length];
            for (int i = 0; i < specs.length; i++) {
                Pair<String, Integer> host = PortUtil.parseHost(specs[i], "indexer.rmi");
                this.indexers[i] = new RmiIndexerStub(host.last(), host.first());
            }
            pageMapper = getPageMapper(mdlConfig, specs.length);
        }
    }

    public void close() {

    // Return the configured page mapper.
    private APageMapper getPageMapper(Config config, int numberOfNodes) {
        String[] parts = config.getStringArray("indexer.node.mapper");
        if (null == parts || 0 == parts.length) {
            throw new RuntimeException("No mapper defined for the distributed indexer");
        String mapperClass = parts[0].trim();
        Config mapperConfig = config;
        if (parts.length > 1) {
            String mapperName = parts[1].trim();
            mapperConfig = Config.getConfig(mapperName + "");
        APageMapper mapper;
        try {
            mapper = (APageMapper) Class.forName(mapperClass)
                    .getConstructor(new Class[] { Config.class, Integer.TYPE })
                    .newInstance(new Object[] { mapperConfig, numberOfNodes });
        } catch (Exception e) {
            throw new RuntimeException(e);
        return mapper;

    /**
     * @todo the pageDB is checked for null, but should be checked for
     *   fetchedSize
     */
    protected void internalProcess(FetchDocument doc) {

        Page page = doc.getPage();
        if (null == page) {
            logger.warn("Page is null. Ignoring this document.");
        if (logger.isDebugEnabled()) {
            logger.debug("Doc has tags: " + doc.getTags().toString());
        if (doc.hasTag(EMIT_DOC_TAG)) {
        } else {
            if (page.isEmitted()) {

    // Delete a page from the index
    private void deleteFromIndex(Page page) {
        org.dom4j.Document dom = DocumentHelper.createDocument();
        Element root = dom.addElement("documentDelete");

        try {
            int i = pageMapper.mapPage(page);
            while (indexers[i].index(dom) == IndexerReturnCode.RETRY_QUEUE_FULL) {
                try {
                    Thread.sleep(indexerBusyRetryTime * 1000);
                } catch (InterruptedException e) {
                    logger.debug("Sleep interrupted: " + e, e);
        } catch (Exception e) {
            logger.error(e, e);

    // Calculate the category boost. TODO: range?
    private float calculateCategoryBoost(Map<String, Object> attr) {
        Double categoryBoost = (Double) attr.get("categroy_boost");
        if (null == categoryBoost) {
            categoryBoost = 1d;
        return categoryBoost.floatValue();

    // Calculate the pagerank boost, range 0.1 - 10.
    private float calculatePagerankBoost(Page page) {
        // page rank 
        float score = page.getScore();
        int bucket = 0;
        for (; bucket < scoreThreshold.length && scoreThreshold[bucket] <= score; bucket++)
        float pagerankBoost = bucket - 1;
        if (bucket < scoreThreshold.length) {
            float bucketSpan = (scoreThreshold[bucket] - scoreThreshold[bucket - 1]);
            if (bucketSpan > 0) {
                pagerankBoost += (score - scoreThreshold[bucket - 1]) / bucketSpan;
            if (pagerankBoost < 0.1f)
                pagerankBoost = 0.1f;
        return pagerankBoost;

    // Calculate the spamrank boost, range 0.01 - 1.
    private float calculateSpamrankBoost(Page page) {
        // spam rank 
        float spamrank = 1f;
        if (antiScoreThreshold[0] < antiScoreThreshold[antiScoreThreshold.length - 1]) {
            float score = page.getAntiScore();
            int bucket = 0;
            for (; bucket < antiScoreThreshold.length && antiScoreThreshold[bucket] <= score; bucket++)
            spamrank = bucket - 1;
            if (bucket < antiScoreThreshold.length) {
                float bucketSpan = (antiScoreThreshold[bucket] - antiScoreThreshold[bucket - 1]);
                if (bucketSpan > 0) {
                    spamrank += (score - antiScoreThreshold[bucket - 1]) / bucketSpan;
            spamrank = (10f - spamrank) / 10f;
            if (spamrank < 0.01f)
                spamrank = 0.01f;
        return spamrank;

    // Calculate the log(inlinks) boost, range 0.1 - 10 for up to 20000 inlinks
    private float calculateLogBoost(Page page) {
        // log(inlinks) boost 
        float logBoost = 0.1f;
        int inlinks = page.getNumInlinks();
        if (inlinks > 1) {
            logBoost = (float) Math.log(inlinks);
        return logBoost;

    // Calculate the freshness boost. TODO: range?
    private float calculateFreshnessBoost(Page page) {
        final long MILLIS_IN_A_DAY = 24 * 60 * 60 * 1000L;
        long daysSinceLastChange = (System.currentTimeMillis() - page.getLastChange()) / MILLIS_IN_A_DAY;
        if (0 == daysSinceLastChange)
            daysSinceLastChange = 1;
        return 1f + (freshnessWeight * freshnessDamp) / (daysSinceLastChange + freshnessDamp);

    // Convert from a weight value in the [0.0, 1.0] range to a damp value in the corresponding [10, 0] range.
    // This is a convenience so that the user can think in terms of weight (0 = no weight at all, 1 = full weight),
    // but the program needs the damp factor (0 = no damp, 10 = full damp). 
    // @see factor()
    private int weightToDamp(float weight) {
        return (int) (10.0 * (1.0f - weight));

    // Damp the influence of a value in the boost formula.
    // If damp == 0, the value is returned unchanged, so the value retains its full influence.
    // If damp == 10, the value is disregarded and 1.0 is returned, so the value has no influence at all.
    // If damp is in the [1,9] range, the damp-power-of-two root of the value is returned, which gets closer to 1.0 as damp increases.
    private float factor(String name, float value, int damp) {
        if (value < 0.01f || value > 15f) {
            logger.warn(name + " boost value out of range! (" + value + ")");
            value = (value < 0.01f) ? 0.01f : 15f;
        if (damp >= 10)
            return 1.0f;
        for (int i = 0; i < damp; i++) {
            value = (float) Math.sqrt(value);
        return value;

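    /**
     * Polymorphic method for deciding how to compose a documentId
     * from a page.
     * @param page the page to compose the documentId for
     * @return the documentId; this implementation uses the page url
     */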
    protected String getDocumentId(Page page) {
        return page.getUrl();

    // Add a page to the index
    protected void addToIndex(FetchDocument doc) {

        byte[] content = doc.getContent();
        if (0 == content.length) {
            logger.warn("Page has no data. Ignoring this document.");

        Set<String> categories = doc.getCategories();
        Map<String, Object> attributes = doc.getAttributes();
        Map<String, Object> indexableAttributes = doc.getIndexableAttributes();

        // build xml doc
        org.dom4j.Document dom = DocumentHelper.createDocument();
        Element root = dom.addElement("documentAdd");
        Page page = doc.getPage();
        String text = doc.getText();
        String url = page.getUrl();
        String host = getHost(url);
        String title = doc.getTitle(titleLengthLimit);
        String tokenizedHost = tokenizeHost(host);
        String anchorText = getAnchorText(page);

        float categoryBoost = calculateCategoryBoost(attributes);
        float pagerankBoost = calculatePagerankBoost(page);
        float spamrankBoost = calculateSpamrankBoost(page);
        float logBoost = calculateLogBoost(page);
        float freshnessBoost = calculateFreshnessBoost(page);

        // add overall score
        float f1 = factor("category", categoryBoost, categoryBoostDamp);
        float f2 = factor("pagerank", pagerankBoost, pagerankBoostDamp);
        float f3 = factor("spamrank", spamrankBoost, spamrankBoostDamp);
        float f4 = factor("log", logBoost, logBoostDamp);
        float f5 = factor("freshness", freshnessBoost, freshnessBoostDamp);
        float f6 = ((Double) attributes.get("boost")).floatValue(); // as calculated by the boost module, or 1.0 if no boost module is defined.
        float boost = f1 * f2 * f3 * f4 * f5 * f6;

        // System.out.println("BOOST url=["+url+"]  category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+")  pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+")  log="+f3+" ("+logBoost+":"+logBoostDamp+")  freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+"  Boost="+boost);

        if (boost < 1e-6f) {
            logger.warn("Boost too low! (" + boost + ")  category=" + f1 + " (" + categoryBoost + ":"
                    + categoryBoostDamp + ")  pagerank=" + f2 + " (" + pagerankBoost + ":" + pagerankBoostDamp
                    + ")  spamrank=" + f3 + " (" + spamrankBoost + ":" + spamrankBoostDamp + ")  log=" + f4 + " ("
                    + logBoost + ":" + logBoostDamp + ")  freshness=" + f5 + " (" + freshnessBoost + ":"
                    + freshnessBoostDamp + ") moduleBoost=" + f6);
            boost = 1e-6f;

        if (null == title || "".equals(title)) {
            title = "Untitled";


        Map<String, Double> boostMap = (Map<String, Double>) attributes.get("field_boost");

        // add the search fields
        addField(root, "url", url, true, true, true, boostMap);
        addField(root, "site", host, true, true, false, boostMap);
        addField(root, "tokenizedHost", tokenizedHost, false, true, true, boostMap);
        addField(root, "title", title, true, true, true, boostMap);
        addField(root, "text", text, true, true, true, boostMap);
        addField(root, "anchor", anchorText, false, true, true, boostMap);
        addField(root, "crawl", crawlName, false, true, true, boostMap);

        if (sendContent) {
            addBody(root, doc, content);

        // for debugging only
        //addField(root, "boostinfo", "category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+")  pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+")  log="+f3+" ("+logBoost+":"+logBoostDamp+")  freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+"  Boost="+boost, true, false, false, null);

        addAdditionalFields(root, page, boostMap);

        // Adding metainfo from attributes
        Set<Entry<String, Object>> attributeSet = indexableAttributes.entrySet();
        for (Entry<String, Object> attribute : attributeSet) {
            addField(root, attribute.getKey(), attribute.getValue() == null ? "" : attribute.getValue().toString(),
                    true, true, true, boostMap);

        StringBuffer assignedCategories = new StringBuffer();
        if (null != categories) {
            // iterate through the classes the page belongs to add each category and its score
            for (Iterator<String> iter = categories.iterator(); iter.hasNext();) {
                assignedCategories.append(" ");

                // repeat the field times proportional to the score (this is a way to boost the document by category);
                //for (int rep = 0; rep < score*10; rep++) {
                //    addField(root, "categoryBoost", categ, false, true, false);
            addField(root, "categories", assignedCategories.toString().trim(), true, true, true, boostMap);

        if (logger.isDebugEnabled()) {
            logger.debug("Indexing dom: " + DomUtil.domToString(dom));
        // Send the document to the indexer. If the queue is full, wait and retry.
        try {
            int i = pageMapper.mapPage(page);
            while (indexers[i].index(dom) == IndexerReturnCode.RETRY_QUEUE_FULL) {
                try {
                    Thread.sleep(indexerBusyRetryTime * 1000);
                } catch (InterruptedException e) {
                    logger.debug("Sleep interrupted: " + e, e);
        } catch (Exception e) {
            logger.error(e, e);

    /**
     * Intended for extension.
     * Any subclass of IndexerModule should override this method to add any additional field it needs.
     */
    protected void addAdditionalFields(Element root, Page page, Map<String, Double> boostMap) {
        //Intended for extension.

    /**
     * Adds a new field to the <code>doc</code> Element.
     * @param doc the element to add the field to
     * @param name the name of the field
     * @param value the String value for the field
     * @param stored true iff the field should be stored
     * @param indexed true iff the field should be indexed
     * @param tokenized true iff the field should be tokenized
     * @param boostMap map containing the boosts for each field name
     */
    protected final void addField(Element doc, String name, String value, boolean stored, boolean indexed,
            boolean tokenized, Map<String, Double> boostMap) {
        Double boost = 1.0d;
        if (null != boostMap && boostMap.containsKey(name)) {
            boost = boostMap.get(name);
        doc.addElement("field").addAttribute("name", name).addAttribute("stored", Boolean.toString(stored))
                .addAttribute("indexed", Boolean.toString(indexed))
                .addAttribute("tokenized", Boolean.toString(tokenized)).addAttribute("boost", boost.toString())

    protected final void addBody(Element doc, FetchDocument fetchDoc, byte[] bytes) {
        String encoding = null;
        // find charset. http headers usually have a Content-Type line, but
        // as it may not be in the same case, all headers are stored lowercased.
        // Content-Type lines contain mime-type and charset, separated by ;
        // for example
        // Content-Type: text/html; charset=UTF-8
        if (fetchDoc.getHeader().containsKey("content-type")) {
            String[] tokens = fetchDoc.getHeader().get("content-type").split(";");
            for (String token : tokens) {
                if (token.toLowerCase().contains("charset") && token.contains("=")) {
                    encoding = token.split("=")[1].trim().toUpperCase();
        // if not found, use default encoding
        if (null == encoding) {
            encoding = java.nio.charset.Charset.defaultCharset().name();

        try {
            doc.addElement("body").addText(new String(bytes, encoding));
        } catch ( e) {
            logger.error("while adding body: ", e);

    // Extract the host part of the url
    private String getHost(String url) {
        String host;
        try {
            host = new URI(url).getHost();
            if (null == host) {
                host = "";
            if (0 == host.trim().length()) {
                logger.warn("Null or empty host (" + url + ")");
        } catch (URISyntaxException e) {
            logger.warn("Invalid url (" + url + ")");
            host = "";
        return host;

    // Separate a host name into its parts
    private String tokenizeHost(String host) {
        String tokenizedHost;
        if (0 == host.trim().length()) {
            tokenizedHost = "";
        } else {
            String[] hostParts = host.split("\\.");
            StringBuffer buf = new StringBuffer();

            // strip the common parts away
            for (String part : hostParts) {
                if (!hostStopWords.contains(part)) {
                    buf.append(" ");

            // add the normalized domain (sans subdomain)
            int keep = (hostParts[hostParts.length - 1].length() == 2) ? 3 : 2;
            keep = Math.min(keep, hostParts.length);
            for (int i = hostParts.length - keep; i < hostParts.length; i++) {
                if (i < hostParts.length - 1) {

            tokenizedHost = buf.toString();
        return tokenizedHost;

    // Return a string with all the anchors
    private String getAnchorText(Page page) {
        StringBuffer anchorText = new StringBuffer();
        String[] anchors = page.getAnchors();
        for (int i = 0; i < anchors.length; i++) {
            anchorText.append(" ");
        return anchorText.toString();

    public void applyCommand(Object command) {
        if ("optimize".equals(command.toString())) {
  "optimize requested.");
            try {
                org.dom4j.Document dom = DocumentHelper.createDocument();
                dom.addElement("command").addAttribute("name", "optimize");
                for (int i = 0; i < indexers.length; i++) {
            } catch (Exception e) {
                logger.error(e, e);
        } else if ("delete".equals(command.toString())) {
            FetchDocument doc = ((CommandWithDoc) command).getDoc();
            Page page = doc.getPage();
        } else if ("startCycle".equals(command.toString())) {
            PageDB pagedb = ((CommandWithPageDB) command).getPageDB();
            scoreThreshold = new float[11];
            for (int i = 0; i < 11; i++) {
                scoreThreshold[i] = pagedb.getScoreThreshold(i * 10);
            antiScoreThreshold = new float[11];
            for (int i = 0; i < 11; i++) {
                antiScoreThreshold[i] = pagedb.getAntiScoreThreshold(i * 10);
