org.carrot2.examples.clustering.UsingComponentSuites.java Source code

Java tutorial

Introduction

Here is the source code for org.carrot2.examples.clustering.UsingComponentSuites.java

Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2014, Dawid Weiss, Stanisaw Osiski.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.examples.clustering;

import java.io.File;
import java.util.List;
import java.util.Map;

import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.DocumentSourceDescriptor;
import org.carrot2.core.IDocumentSource;
import org.carrot2.core.ProcessingComponentDescriptor;
import org.carrot2.core.ProcessingComponentSuite;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.CommonAttributesDescriptor;
import org.carrot2.examples.ConsoleFormatter;
import org.carrot2.source.microsoft.Bing3WebDocumentSourceDescriptor;
import org.carrot2.util.resource.ContextClassLoaderLocator;
import org.carrot2.util.resource.DirLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.ResourceLookup;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * This example shows how to use 
 * <a href="http://download.carrot2.org/head/manual/#section.customizing.component-suites-and-attributes.component-suites">
 * Carrot<sup>2</sup> component suites</a>. Component suites come handy when you need to
 * manage a number of document sources or clustering algorithms, along with their
 * attributes and some metadata. such as human readable labels.
 */
public class UsingComponentSuites {
    public static void main(String[] args) throws Exception {
        @SuppressWarnings("unchecked")
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        // Initialization-time attributes that will apply to all components.
        final Map<String, Object> initAttributes = Maps.newHashMap();

        // Prepare resource lookup facade. We will use the suites directory 
        // and class path resources.
        final ResourceLookup resourceLookup = new ResourceLookup(new DirLocator(new File("suites")),
                new ContextClassLoaderLocator());

        // We know we'll be using Bing so set up its access key.
        // use your own ID here!
        Bing3WebDocumentSourceDescriptor.attributeBuilder(initAttributes).appid(BingKeyAccess.getKey());

        // We'll read the component suite definition from an XML stream.
        // IResource is an abstraction layer over resources in Carrot2.
        IResource suiteXml = resourceLookup.getFirst("suite-examples.xml");

        // Deserialize the component suite definition.
        final ProcessingComponentSuite suite = ProcessingComponentSuite.deserialize(suiteXml, resourceLookup);

        // Initialize the controller with the suite. All components from the suite
        // will be available for processing within this controller.
        controller.init(initAttributes, suite.getComponentConfigurations());

        // From the suite definition, you can get the document sources and clustering
        // algorithm descriptors.
        final List<DocumentSourceDescriptor> sources = suite.getSources();
        final List<String> sourceIds = Lists.transform(sources,
                ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE);
        System.out.println("Found " + sourceIds.size() + " document sources: " + sourceIds);

        final List<ProcessingComponentDescriptor> algorithms = suite.getAlgorithms();
        final List<String> algorithmIds = Lists.transform(algorithms,
                ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE);
        System.out.println("Found " + algorithmIds.size() + " clutering algorithms: " + algorithmIds + "\n\n");

        // Run not more than two algorithms on not more than two sources
        for (int s = 0; s < Math.min(sourceIds.size(), 2); s++) {
            for (int a = 0; a < Math.min(algorithmIds.size(), 2); a++) {
                // You can retrieve some metadata about the components, such as
                // human-readable label, from their descriptors.
                System.out.println("Querying " + sources.get(s).getLabel() + ", clustering with "
                        + algorithms.get(a).getLabel());

                // As usual, we pass attributes for processing
                final Map<String, Object> attributes = Maps.newHashMap();
                CommonAttributesDescriptor.attributeBuilder(attributes).query("data mining");

                // Pass component ids to the controller to perform processing
                final ProcessingResult result = controller.process(attributes, sourceIds.get(s),
                        algorithmIds.get(a));
                ConsoleFormatter.displayClusters(result.getClusters());
                System.out.println();
            }
        }
    }
}