org.carrot2.examples.clustering.UsingCustomLexicalResources.java Source code

Introduction

Here is the source code for org.carrot2.examples.clustering.UsingCustomLexicalResources.java, a Carrot2 example that shows how to read lexical resources from a custom location instead of the default context class loader.

Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.examples.clustering;

import java.io.File;
import java.util.Map;

import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.clustering.stc.STCClusteringAlgorithm;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.IDocumentSource;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.CommonAttributesDescriptor;
import org.carrot2.examples.ConsoleFormatter;
import org.carrot2.examples.SampleDocumentData;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.linguistic.LexicalDataLoaderDescriptor;
import org.carrot2.util.resource.DirLocator;
import org.carrot2.util.resource.ResourceLookup;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * This example shows how to configure the location of lexical resources to be
 * something other than the default (by default lexical resources are read using
 * the context class loader).
 */
public class UsingCustomLexicalResources {
    public static void main(String[] args) {
        @SuppressWarnings("unchecked")
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        // We will pass our custom resource locator at initialization time. There is a
        // variety of implementations of the IResourceLocator interface; here we use
        // an explicit filesystem folder in the current working directory.
        File resourcesDir = new File("resources");
        ResourceLookup resourceLookup = new ResourceLookup(new DirLocator(resourcesDir));
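        // The directory is expected to contain the lexical resource files that
        // Carrot2 would otherwise load from the classpath (for example, per-language
        // stop word and stop label lists); see the sketch after this listing for one
        // possible way to prepare such a directory.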

        Map<String, Object> attrs = Maps.newHashMap();

        // Note that we tell the linguistic component to merge all lexical resources;
        // this is the default setting and it usually helps with multi-lingual content.
        DefaultLexicalDataFactoryDescriptor.attributeBuilder(attrs).mergeResources(true);
        LexicalDataLoaderDescriptor.attributeBuilder(attrs).resourceLookup(resourceLookup);

        controller.init(attrs);

        // Cluster some data with Lingo and STC.
        clusterAndDisplayClusters(controller, LingoClusteringAlgorithm.class);
        clusterAndDisplayClusters(controller, STCClusteringAlgorithm.class);
    }

    /**
     * Clusters results for query "data mining" and displays the clusters.
     */
    private static void clusterAndDisplayClusters(final Controller controller,
            final Class<? extends IClusteringAlgorithm> clusteringAlgorithm) {
        final Map<String, Object> processingAttributes = Maps.newHashMap();

        CommonAttributesDescriptor.attributeBuilder(processingAttributes)
                .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING)).query("data mining");

        final ProcessingResult result = controller.process(processingAttributes, clusteringAlgorithm);
        ConsoleFormatter.displayClusters(result.getClusters(), 0);
    }
}
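
As a usage sketch (not part of the original example), the snippet below prepares the "resources" directory that UsingCustomLexicalResources reads from. The file names stopwords.en and stoplabels.en, the class name PrepareCustomLexicalResources, and the sample contents are assumptions for illustration, based on Carrot2's convention of per-language stop word and stop label lists; adjust them to match the resources shipped with your Carrot2 version.

package org.carrot2.examples.clustering;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;

/**
 * Prepares a "resources" folder in the current working directory with sample
 * lexical resource files. Run this before UsingCustomLexicalResources.
 */
public class PrepareCustomLexicalResources {
    public static void main(String[] args) throws Exception {
        File resourcesDir = new File("resources");
        if (!resourcesDir.isDirectory() && !resourcesDir.mkdirs()) {
            throw new IllegalStateException("Could not create: " + resourcesDir);
        }

        // One stop word per line; these words are ignored when building clusters.
        Files.write(new File(resourcesDir, "stopwords.en").toPath(),
            "data\nmining\ninformation\n".getBytes(StandardCharsets.UTF_8));

        // Stop label patterns; cluster labels matching these expressions are suppressed.
        Files.write(new File(resourcesDir, "stoplabels.en").toPath(),
            "(?i)various .*\n(?i)more information\n".getBytes(StandardCharsets.UTF_8));

        System.out.println("Wrote lexical resources to " + resourcesDir.getAbsolutePath());
    }
}

With the directory in place, the DirLocator configured at initialization time should resolve these files when the controller loads its lexical data.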