Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.keymatch;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.util.DomUtil;
import org.apache.xerces.dom.DocumentImpl;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * <p>SimpleKeyMatcher is responsible for targeting predefined links for
 * defined keywords, for example to promote some urls that are not yet part of
 * the production index.</p>
 * <p>SimpleKeyMatcher is not a text ad targeting system.</p>
 * <p>KeyMatcher is configured with an xml configuration file:
 * <br><pre>
 * &lt;?xml version="1.0"?&gt;
 * &lt;keymatches&gt;
 *   &lt;keymatch type="keyword|phrase|exact"&gt;
 *     &lt;term&gt;search engine&lt;/term&gt;
 *     &lt;url&gt;http://lucene.apache.org/nutch&lt;/url&gt;
 *     &lt;title&gt;Your favourite search engine!&lt;/title&gt;
 *   &lt;/keymatch&gt;
 * &lt;/keymatches&gt;</pre>
 * By default KeyMatcher expects the file to be named keymatches.xml
 * </p>
 * <p>Match type can be one of the following: keyword, phrase, exact match.
 * Terms of a query are produced by the Query object and none of the
 * matches is case sensitive.</p>
 * <b>keyword</b><br>
 * Matches on keyword level, for example query "search engine" would match both
 * keywords search and engine<br>
 * <br>
 * <b>phrase</b><br>
 * Matches phrase, for example: query "open source search engine" "search engine watch"
 * would match "search engine", but query "search from engine" would not.<br>
 * <br>
 * <b>exact</b><br>
 * Query "open source search engine" would match "open source search engine", but not
 * "search engine" nor "best open source engine"<br>
 *
 * <p>Implementation note: keymatches are stored in a single map whose keys are
 * a one-character type prefix followed by the lower-cased term. A map value is
 * either a single {@link KeyMatch} or an {@link ArrayList} of them when several
 * keymatches share the same key.</p>
 */
public class SimpleKeyMatcher extends Configured {

  static final char PREFIX_KEYWORD = 'k';
  static final char PREFIX_PHRASE = 'p';
  static final char PREFIX_EXACT = 'e';

  /**
   * Counts how many phrase keymatches were registered per number of terms.
   * {@link SimpleKeyMatcher#getMatches(Query, Map)} uses these counters to
   * skip phrase lengths for which nothing was registered. Counters are only
   * ever incremented.
   */
  class KeyMatcherStats {

    /** terms[n] is the number of registered phrases consisting of n terms. */
    int terms[];

    /**
     * Records one phrase made of <code>numTerms</code> terms. Counts outside
     * the tracked range (negative, or &gt;= the array size) are ignored.
     *
     * @param numTerms number of terms in the phrase
     */
    void addStats(int numTerms) {
      // Strictly less than: terms.length itself is not a valid index.
      if (numTerms >= 0 && numTerms < terms.length) {
        terms[numTerms]++;
      }
    }

    /**
     * @param size number of counters; phrases with <code>size</code> or more
     *             terms are not tracked
     */
    public KeyMatcherStats(int size) {
      // A new int[] is zero-filled by the JVM, no explicit reset needed.
      terms = new int[size];
    }
  }

  public static final Log LOG = LogFactory.getLog(SimpleKeyMatcher.class);

  public static final String TAG_KEYMATCH = "keymatch";
  public static final String TAG_KEYMATCHES = "keymatches";

  static final String DEFAULT_CONFIG_FILE = "keymatches.xml";

  /** Size of the phrase statistics table, see {@link KeyMatcherStats}. */
  static final int MAX_TERMS = 5;

  KeyMatcherStats stats;

  KeyMatchFilter currentFilter;

  /** Prefixed key -> KeyMatch, or ArrayList of KeyMatch for shared keys. */
  HashMap matches = new HashMap();

  private String configName;

  /**
   * Construct new SimpleKeyMatcher reading the default configuration file
   * ({@value #DEFAULT_CONFIG_FILE}).
   *
   * @param conf configuration used to locate the resource
   */
  public SimpleKeyMatcher(Configuration conf) {
    this(DEFAULT_CONFIG_FILE, conf);
  }

  /**
   * Sets currentFilter
   *
   * @param filter the filter to set
   */
  public void setFilter(KeyMatchFilter filter) {
    this.currentFilter = filter;
  }

  /**
   * Construct new SimpleKeyMatcher with provided filename and configuration.
   * The filter defaults to a {@link ViewCountSorter} and the keymatches are
   * loaded immediately through {@link #init()}.
   *
   * @param resourceName name of the keymatch configuration resource
   * @param conf configuration used to locate the resource
   */
  public SimpleKeyMatcher(String resourceName, Configuration conf) {
    super(conf);
    configName = resourceName;
    stats = new KeyMatcherStats(MAX_TERMS);
    currentFilter = new ViewCountSorter();
    init();
  }

  /**
   * Initialize keyword matcher by loading every <code>keymatch</code> element
   * of the configuration resource. When the resource is not found the current
   * keymatches are left untouched.
   */
  protected void init() {
    final InputStream input = getConf().getConfResourceAsInputStream(configName);
    if (input == null) {
      return;
    }

    final Element root;
    try {
      root = DomUtil.getDom(input);
    } finally {
      // Always release the stream, even when parsing throws.
      try {
        input.close();
      } catch (IOException e1) {
        LOG.warn("Error closing keymatch configuration " + configName, e1);
      }
    }

    // NOTE(review): presumably DomUtil.getDom returns null when the document
    // cannot be parsed -- confirm. Guarding avoids a NullPointerException.
    if (root == null) {
      LOG.warn("Could not parse keymatch configuration " + configName);
      return;
    }

    final NodeList nodeList = root.getElementsByTagName(TAG_KEYMATCH);
    LOG.debug("Configuration file has " + nodeList.getLength()
        + " KeyMatch entries.");

    // Build into a temporary map and swap it in once fully populated.
    final HashMap tempMap = new HashMap();
    for (int i = 0; i < nodeList.getLength(); i++) {
      final Element element = (Element) nodeList.item(i);
      final KeyMatch keyMatch = new KeyMatch();
      keyMatch.initialize(element);
      addKeyMatch(tempMap, keyMatch);
    }
    matches = tempMap;
  }

  /**
   * Get keymatches for query. Keyword, phrase and exact matches are collected
   * (in that order) and then handed to the current filter.
   *
   * @param query parsed query
   * @param context evaluation context
   * @return array of keymatches
   */
  public KeyMatch[] getMatches(final Query query, Map context) {
    final ArrayList currentMatches = new ArrayList();
    final String terms[] = query.getTerms();

    // "keyword": every single term is looked up on its own.
    for (int i = 0; i < terms.length; i++) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("keyword: '" + terms[i] + "'");
      }
      addMatches(currentMatches, matches.get(PREFIX_KEYWORD + terms[i]));
    }

    // "phrase": every run of 2..N consecutive terms. N is capped by the size
    // of the statistics table so that long queries cannot index past its end.
    final int maxPhraseLength = Math.min(terms.length, stats.terms.length - 1);
    for (int length = 2; length <= maxPhraseLength; length++) {
      if (stats.terms[length] <= 0) {
        // No phrase of this length was registered, nothing can match.
        continue;
      }
      for (int start = 0; start <= terms.length - length; start++) {
        final String key = joinTerms(terms, start, length);
        if (LOG.isDebugEnabled()) {
          LOG.debug("phrase key: '" + key + "'");
        }
        addMatches(currentMatches, matches.get(PREFIX_PHRASE + key));
      }
    }

    // "exact": the textual form of the whole query.
    final String key = query.toString();
    if (LOG.isDebugEnabled()) {
      LOG.debug("exact key: '" + key + "'");
    }
    addMatches(currentMatches, matches.get(PREFIX_EXACT + key));

    return currentFilter.filter(currentMatches, context);
  }

  /**
   * Joins <code>length</code> consecutive terms with single spaces.
   *
   * @param terms source terms
   * @param start index of the first term to use
   * @param length number of terms to join
   * @return the joined phrase
   */
  private static String joinTerms(final String[] terms, final int start,
      final int length) {
    final StringBuffer phrase = new StringBuffer();
    for (int i = start; i < start + length; i++) {
      if (i > start) {
        phrase.append(' ');
      }
      phrase.append(terms[i]);
    }
    return phrase.toString();
  }

  /**
   * Appends a map value to the result list. The value may be
   * <code>null</code> (ignored), a list of keymatches (all added) or a single
   * keymatch.
   *
   * @param currentMatches result list to append to
   * @param match value fetched from the keymatch map
   */
  void addMatches(ArrayList currentMatches, Object match) {
    if (match == null) {
      return;
    }
    if (match instanceof ArrayList) {
      currentMatches.addAll((ArrayList) match);
    } else {
      currentMatches.add(match);
    }
  }

  /**
   * Get tokens of a string with nutch Query parser
   *
   * @param string text to tokenize
   * @return the terms of the parsed query, or an empty array when parsing
   *         fails with an IOException
   */
  private String[] getTokens(final String string) {
    try {
      final Query q = Query.parse(string, getConf());
      return q.getTerms();
    } catch (IOException e) {
      LOG.info("Error getting terms from query:" + e);
    }
    return new String[0];
  }

  /**
   * Add new keymatch to the given map. The term of the keymatch is lower-cased
   * in place. If the map already holds an entry for the same key, the entry
   * becomes (or stays) a list and the keymatch is appended to it.
   *
   * @param map map to add the keymatch to
   * @param keymatch keymatch to add
   */
  protected void addKeyMatch(Map map, final KeyMatch keymatch) {
    LOG.info("Adding keymatch: MATCHTYPE=" + KeyMatch.TYPES[keymatch.type]
        + ", TERM='" + keymatch.term + "', TITLE='" + keymatch.title
        + "' ,URL='" + keymatch.url + "'");

    keymatch.term = keymatch.term.toLowerCase();

    final char prefix;
    switch (keymatch.type) {
    case KeyMatch.TYPE_EXACT:
      prefix = PREFIX_EXACT;
      break;
    case KeyMatch.TYPE_PHRASE:
      prefix = PREFIX_PHRASE;
      break;
    default:
      prefix = PREFIX_KEYWORD;
      break;
    }

    // add info about keyword count for optimization
    if (keymatch.type == KeyMatch.TYPE_PHRASE) {
      stats.addStats(getTokens(keymatch.term).length);
    }

    final String key = prefix + keymatch.term;

    // Look the existing value up in the map being filled, which is not
    // necessarily the 'matches' field (init and setKeyMatches pass a
    // temporary map).
    final Object existing = map.get(key);
    if (existing == null) {
      map.put(key, keymatch);
    } else if (existing instanceof ArrayList) {
      ((ArrayList) existing).add(keymatch);
    } else {
      final ArrayList list = new ArrayList();
      list.add(existing);
      list.add(keymatch);
      map.put(key, list);
    }
  }

  /**
   * Add Keymatch to the keymatches currently in use.
   *
   * @param match keymatch to add
   */
  public void addKeyMatch(KeyMatch match) {
    addKeyMatch(matches, match);
  }

  /**
   * Saves keymatch configuration into file. The target is the file the
   * configuration resource was resolved to.
   *
   * @throws IOException if the resource cannot be found or written
   */
  public void save() throws IOException {
    final URL url = getConf().getResource(configName);
    if (url == null) {
      throw new IOException("Resource not found: " + configName);
    }

    // Build the whole document before opening the output stream: opening it
    // truncates the file, so a failure while building must not happen later.
    final DocumentImpl doc = new DocumentImpl();
    final Element keymatches = doc.createElement(TAG_KEYMATCHES);
    final Iterator values = matches.values().iterator();
    while (values.hasNext()) {
      final Object value = values.next();
      if (value instanceof ArrayList) {
        // Several keymatches share this key, write each of them.
        final Iterator shared = ((ArrayList) value).iterator();
        while (shared.hasNext()) {
          appendKeyMatch(doc, keymatches, (KeyMatch) shared.next());
        }
      } else {
        appendKeyMatch(doc, keymatches, (KeyMatch) value);
      }
    }

    try {
      final FileOutputStream fos = new FileOutputStream(new File(url.getFile()));
      try {
        DomUtil.saveDom(fos, keymatches);
        fos.flush();
      } finally {
        fos.close();
      }
    } catch (FileNotFoundException e) {
      // Same exception type and message as before, but keep the cause.
      final IOException ioe = new IOException(e.toString());
      ioe.initCause(e);
      throw ioe;
    }
  }

  /**
   * Creates a <code>keymatch</code> element for one keymatch and appends it
   * to the parent element.
   *
   * @param doc document used to create the element
   * @param parent element the new element is appended to
   * @param keyMatch keymatch that populates the new element
   */
  private void appendKeyMatch(final DocumentImpl doc, final Element parent,
      final KeyMatch keyMatch) {
    final Element element = doc.createElement(TAG_KEYMATCH);
    keyMatch.populateElement(element);
    parent.appendChild(element);
  }

  /**
   * Clear keymatches from this SimpleKeyMatcher instance
   */
  public void clear() {
    matches = new HashMap();
  }

  /**
   * Replaces the current keymatches with the given ones.
   *
   * @param keymatches list of {@link KeyMatch} objects
   */
  public void setKeyMatches(List keymatches) {
    final HashMap replacement = new HashMap();
    final Iterator i = keymatches.iterator();
    while (i.hasNext()) {
      addKeyMatch(replacement, (KeyMatch) i.next());
    }
    matches = replacement;
  }
}