Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.clustering.carrot2; import java.util.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.clustering.HitsCluster; import org.apache.nutch.clustering.OnlineClusterer; import org.apache.nutch.searcher.HitDetails; import com.dawidweiss.carrot.core.local.*; import com.dawidweiss.carrot.core.local.clustering.RawCluster; import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent; import com.dawidweiss.carrot.core.local.linguistic.Language; import com.dawidweiss.carrot.util.tokenizer.languages.AllKnownLanguages; import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent; /** * An plugin providing an implementation of {@link OnlineClusterer} * extension using clustering components of the Carrot2 project * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>). * * We hardcode the following Carrot2 process: * <pre><![CDATA[ * <local-process id="yahoo-lingo"> * <name>Yahoo Search API -- Lingo Classic Clusterer</name> * * <input component-key="input-localnutch" /> * <filter component-key="filter-lingo" /> * <output component-key="output-clustersConsumer" /> * </local-process> * ]]></pre> * * @author Dawid Weiss * @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $ */ public class Clusterer implements OnlineClusterer, Configurable { /** Default language property name. */ private final static String CONF_PROP_DEFAULT_LANGUAGE = "extension.clustering.carrot2.defaultLanguage"; /** Recognizable languages property name. */ private final static String CONF_PROP_LANGUAGES = "extension.clustering.carrot2.languages"; /** Internal clustering process ID in Carrot2 LocalController */ private final static String PROCESS_ID = "nutch-lingo"; public static final Log logger = LogFactory.getLog(Clusterer.class); /** The LocalController instance used for clustering */ private LocalController controller; /** Nutch configuration. */ private Configuration conf; /** * Default language for hits. English by default, but may be changed * via a property in Nutch configuration. */ private String defaultLanguage = "en"; /** * A list of recognizable languages.. * English only by default, but configurable via Nutch configuration. */ private String[] languages = new String[] { defaultLanguage }; /** * An empty public constructor for making new instances * of the clusterer. */ public Clusterer() { initialize(); } private synchronized void initialize() { controller = new LocalControllerBase(); addComponentFactories(); addProcesses(); } /** Adds the required component factories to a local Carrot2 controller. */ private void addComponentFactories() { // * <input component-key="input-localnutch" /> LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() { public LocalComponent getInstance() { return new LocalNutchInputComponent(defaultLanguage); } }; controller.addLocalComponentFactory("input-localnutch", nutchInputFactory); // * <filter component-key="filter-lingo" /> LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() { public LocalComponent getInstance() { HashMap defaults = new HashMap(); // These are adjustments settings for the clustering algorithm. // If you try the live WebStart demo of Carrot2 you can see how they affect // the final clustering: http://www.carrot2.org/webstart defaults.put("lsi.threshold.clusterAssignment", "0.150"); defaults.put("lsi.threshold.candidateCluster", "0.775"); // Initialize a new Lingo clustering component. ArrayList languageList = new ArrayList(languages.length); for (int i = 0; i < languages.length; i++) { final String lcode = languages[i]; try { Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode); if (lang == null) { if (logger.isWarnEnabled()) { logger.warn("Language not supported in Carrot2: " + lcode); } } else { languageList.add(lang); if (logger.isDebugEnabled()) { logger.debug("Language loaded: " + lcode); } } } catch (Throwable t) { if (logger.isWarnEnabled()) { logger.warn("Language could not be loaded: " + lcode, t); } } } return new LingoLocalFilterComponent( (Language[]) languageList.toArray(new Language[languageList.size()]), defaults); } }; controller.addLocalComponentFactory("filter-lingo", lingoFactory); // * <output component-key="output-clustersConsumer" /> LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() { public LocalComponent getInstance() { return new ClustersConsumerOutputComponent(); } }; controller.addLocalComponentFactory("output-clustersConsumer", clusterConsumerOutputFactory); } /** * Adds a hardcoded clustering process to the local controller. */ private void addProcesses() { LocalProcessBase process = new LocalProcessBase("input-localnutch", // input "output-clustersConsumer", // output new String[] { "filter-lingo" }, // filters "The Lingo clustering algorithm (www.carrot2.org).", ""); try { controller.addProcess(PROCESS_ID, process); } catch (Exception e) { throw new RuntimeException("Could not assemble clustering process.", e); } } /** * See {@link OnlineClusterer} for documentation. */ public HitsCluster[] clusterHits(HitDetails[] hitDetails, String[] descriptions) { Map requestParams = new HashMap(); requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY, hitDetails); requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY, descriptions); try { // The input component takes Nutch's results so we don't need the query argument. final ProcessingResult result = controller.query(PROCESS_ID, "no-query", requestParams); final ClustersConsumerOutputComponent.Result output = (ClustersConsumerOutputComponent.Result) result .getQueryResult(); final List outputClusters = output.clusters; final HitsCluster[] clusters = new HitsCluster[outputClusters.size()]; int j = 0; for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) { RawCluster rcluster = (RawCluster) i.next(); clusters[j] = new HitsClusterAdapter(rcluster, hitDetails); } // invoke Carrot2 process here. return clusters; } catch (MissingProcessException e) { throw new RuntimeException("Missing clustering process.", e); } catch (Exception e) { throw new RuntimeException("Unidentified problems with the clustering.", e); } } /** * Implementation of {@link Configurable} */ public void setConf(Configuration conf) { this.conf = conf; // Configure default language and other component settings. if (conf.get(CONF_PROP_DEFAULT_LANGUAGE) != null) { // Change the default language. this.defaultLanguage = conf.get(CONF_PROP_DEFAULT_LANGUAGE); } if (conf.getStrings(CONF_PROP_LANGUAGES) != null) { this.languages = conf.getStrings(CONF_PROP_LANGUAGES); } if (logger.isInfoEnabled()) { logger.info("Default language: " + defaultLanguage); logger.info("Enabled languages: " + Arrays.asList(languages)); } initialize(); } /** * Implementation of {@link Configurable} */ public Configuration getConf() { return conf; } }