pt.uminho.anote2.carrot.linkage.examples.MoreConfigurationsOfOneAlgorithmInCachingController.java Source code

Introduction

Here is the source code for pt.uminho.anote2.carrot.linkage.examples.MoreConfigurationsOfOneAlgorithmInCachingController.java
Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2012, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package pt.uminho.anote2.carrot.linkage.examples;

import java.util.HashMap;
import java.util.Map;

import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithmDescriptor;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.IDocumentSource;
import org.carrot2.core.ProcessingComponentConfiguration;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.CommonAttributesDescriptor;
import org.carrot2.matrix.factorization.IterationNumberGuesser.FactorizationQuality;
import org.carrot2.source.microsoft.Bing3WebDocumentSource;

import com.google.common.collect.Maps;

/**
 * It is possible to initialize a {@link Controller} to host a number of different
 * configurations of the same {@link IDocumentSource} or {@link IClusteringAlgorithm} and
 * invoke them as appropriate. This is achieved by assigning a string identifier to each
 * configuration and then passing the identifier to the
 * {@link Controller#process(Map, String...)} method.
 * <p>
 * One example where this setting may be useful is when your application serves multiple
 * customers, each of which need a different configuration of a document source or a
 * clustering algorithm.
 * </p>
 */
public class MoreConfigurationsOfOneAlgorithmInCachingController {
    /**
     * Runs the example: registers two differently tuned configurations of the Lingo
     * algorithm ("lingo-fast" and "lingo-accurate") in a single caching controller,
     * processes the same query with each of them and hands the resulting clusters to
     * {@code ConsoleFormatter} for display.
     *
     * @param args command line arguments, not used
     */
    @SuppressWarnings({ "unchecked" })
    public static void main(String[] args) {
        /*
         * Create a controller that caches all documents.
         */
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        try {
            /*
             * You can define global values for some attributes. These will apply to all
             * configurations we will define below, unless the specific configuration
             * overrides the global attributes.
             */
            final Map<String, Object> globalAttributes = new HashMap<String, Object>();

            LingoClusteringAlgorithmDescriptor.attributeBuilder(globalAttributes).preprocessingPipeline()
                    .documentAssigner().exactPhraseAssignment(false);

            /*
             * Now we will define two different configurations of the Lingo algorithm. One
             * will be optimized for speed of clustering, while the other will optimize the
             * quality of clusters.
             */
            final Map<String, Object> fastAttributes = Maps.newHashMap();
            LingoClusteringAlgorithmDescriptor.attributeBuilder(fastAttributes).desiredClusterCountBase(20)
                    .matrixReducer().factorizationQuality(FactorizationQuality.LOW);

            LingoClusteringAlgorithmDescriptor.attributeBuilder(fastAttributes).preprocessingPipeline()
                    .caseNormalizer().dfThreshold(2);

            final Map<String, Object> accurateAttributes = Maps.newHashMap();
            LingoClusteringAlgorithmDescriptor.attributeBuilder(accurateAttributes).desiredClusterCountBase(40)
                    .matrixReducer().factorizationQuality(FactorizationQuality.HIGH);

            /*
             * The accurate configuration overrides the global exactPhraseAssignment value.
             */
            LingoClusteringAlgorithmDescriptor.attributeBuilder(accurateAttributes).preprocessingPipeline()
                    .documentAssigner().exactPhraseAssignment(true);

            /*
             * Each configuration carries its own dfThreshold. This value must be written
             * to accurateAttributes; writing it to fastAttributes would silently replace
             * the threshold of 2 chosen for the fast configuration above.
             */
            LingoClusteringAlgorithmDescriptor.attributeBuilder(accurateAttributes).preprocessingPipeline()
                    .caseNormalizer().dfThreshold(1);

            /*
             * We initialize the controller passing the global attributes and the two
             * configurations. Notice that a configuration consists of the component
             * class (can be a document source as well as a clustering algorithm), its
             * string identifier and attributes.
             */
            controller.init(globalAttributes,
                    new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class, "lingo-fast",
                            fastAttributes),
                    new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class, "lingo-accurate",
                            accurateAttributes));

            /*
             * Now we can call the two different clustering configurations. Notice that
             * because we now use string identifiers instead of classes, we pass the document
             * source class name rather than the class itself.
             */
            final Map<String, Object> attributes = new HashMap<String, Object>();
            CommonAttributesDescriptor.attributeBuilder(attributes).query("data mining");

            final ProcessingResult fastResult = controller.process(attributes,
                    Bing3WebDocumentSource.class.getName(), "lingo-fast");
            ConsoleFormatter.displayClusters(fastResult.getClusters());

            final ProcessingResult accurateResult = controller.process(attributes,
                    Bing3WebDocumentSource.class.getName(), "lingo-accurate");
            ConsoleFormatter.displayClusters(accurateResult.getClusters());
        } finally {
            /*
             * Shut the controller down even if initialization or processing fails, so
             * that the resources it holds are released.
             */
            controller.dispose();
        }
    }
}