org.apache.nutch.searcher.Query.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.searcher.Query.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.searcher;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.ArrayList;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.analysis.AnalyzerFactory;
import org.apache.nutch.analysis.ParseException;

import org.apache.nutch.analysis.NutchAnalysis;
import org.apache.nutch.util.NutchConfiguration;

/** A Nutch query. */
public final class Query implements Writable, Cloneable, Configurable {
    /** Commons Logging logger for this class. */
    public static final Log LOG = LogFactory.getLog(Query.class);

    /** Parameters attached to this query; serialized after the clauses by {@link #write}. */
    private QueryParams params = new QueryParams();

    /**
     * Replaces the parameters attached to this query.
     *
     * @param context the new parameters; the reference is stored as-is, not copied
     */
    public void setParams(QueryParams context) {
        params = context;
    }

    /**
     * Returns the parameters attached to this query.
     *
     * @return the parameters object held by this query (no defensive copy is made)
     */
    public QueryParams getParams() {
        return this.params;
    }

    /**
     * A query clause: a single {@link Term} or a {@link Phrase}, together with the
     * field it applies to, a weight, and required/prohibited flags.
     */
    public static class Clause implements Cloneable {
        /** Field name used when a clause does not name a field explicitly. */
        public static final String DEFAULT_FIELD = "DEFAULT";

        // Flags packed into the single header byte of the serialized form.
        private static final byte REQUIRED_BIT = 1;
        private static final byte PROHIBITED_BIT = 2;
        private static final byte PHRASE_BIT = 4;

        private boolean isRequired;
        private boolean isProhibited;
        private String field = DEFAULT_FIELD;
        private float weight = 1.0f;
        /** Holds either a {@link Term} or a {@link Phrase}; see {@link #isPhrase()}. */
        private Object termOrPhrase;

        /** Configuration used to build the {@code QueryFilters} consulted by {@link #toString()}. */
        private Configuration conf;

        /**
         * Creates a single-term clause in the given field.
         *
         * @param term         the term to match
         * @param field        the field the term applies to
         * @param isRequired   whether the clause is flagged as required
         * @param isProhibited whether the clause is flagged as prohibited
         * @param conf         configuration kept for later filter lookups
         */
        public Clause(Term term, String field, boolean isRequired, boolean isProhibited, Configuration conf) {
            this(term, isRequired, isProhibited, conf);
            this.field = field;
        }

        /** Creates a single-term clause in {@link #DEFAULT_FIELD}. */
        public Clause(Term term, boolean isRequired, boolean isProhibited, Configuration conf) {
            this.isRequired = isRequired;
            this.isProhibited = isProhibited;
            this.termOrPhrase = term;
            this.conf = conf;
        }

        /**
         * Creates a phrase clause in the given field.
         *
         * @param phrase       the phrase to match
         * @param field        the field the phrase applies to
         * @param isRequired   whether the clause is flagged as required
         * @param isProhibited whether the clause is flagged as prohibited
         * @param conf         configuration kept for later filter lookups
         */
        public Clause(Phrase phrase, String field, boolean isRequired, boolean isProhibited, Configuration conf) {
            this(phrase, isRequired, isProhibited, conf);
            this.field = field;
        }

        /** Creates a phrase clause in {@link #DEFAULT_FIELD}. */
        public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, Configuration conf) {
            this.isRequired = isRequired;
            this.isProhibited = isProhibited;
            this.termOrPhrase = phrase;
            this.conf = conf;
        }

        /** Returns the required flag given at construction. */
        public boolean isRequired() {
            return isRequired;
        }

        /** Returns the prohibited flag given at construction. */
        public boolean isProhibited() {
            return isProhibited;
        }

        /** Returns the name of the field this clause applies to. */
        public String getField() {
            return field;
        }

        /** Returns the weight of this clause; 1.0 unless changed. */
        public float getWeight() {
            return weight;
        }

        /** Sets the weight of this clause. */
        public void setWeight(float weight) {
            this.weight = weight;
        }

        /** Returns true when this clause holds a {@link Phrase} rather than a {@link Term}. */
        public boolean isPhrase() {
            return termOrPhrase instanceof Phrase;
        }

        /**
         * Returns the phrase held by this clause.
         *
         * @throws ClassCastException if the clause holds a term; check {@link #isPhrase()} first
         */
        public Phrase getPhrase() {
            return (Phrase) termOrPhrase;
        }

        /**
         * Returns the term held by this clause.
         *
         * @throws ClassCastException if the clause holds a phrase; check {@link #isPhrase()} first
         */
        public Term getTerm() {
            return (Term) termOrPhrase;
        }

        /**
         * Serializes this clause as: one flag byte, the field name (UTF), the weight
         * (float), then the term or phrase payload.
         *
         * @param out the stream to write to
         * @throws IOException if writing to {@code out} fails
         */
        public void write(DataOutput out) throws IOException {
            byte bits = 0;
            if (isPhrase()) {
                bits |= PHRASE_BIT;
            }
            if (isRequired) {
                bits |= REQUIRED_BIT;
            }
            if (isProhibited) {
                bits |= PROHIBITED_BIT;
            }
            out.writeByte(bits);
            out.writeUTF(field);
            out.writeFloat(weight);

            if (isPhrase()) {
                getPhrase().write(out);
            } else {
                getTerm().write(out);
            }
        }

        /**
         * Reads a clause in the layout produced by {@link #write(DataOutput)}.
         *
         * @param in   the stream to read from
         * @param conf configuration to attach to the new clause
         * @return the deserialized clause, with its weight restored
         * @throws IOException if reading from {@code in} fails
         */
        public static Clause read(DataInput in, Configuration conf) throws IOException {
            byte bits = in.readByte();
            boolean required = ((bits & REQUIRED_BIT) != 0);
            boolean prohibited = ((bits & PROHIBITED_BIT) != 0);

            String field = in.readUTF();
            float weight = in.readFloat();

            Clause clause;
            if ((bits & PHRASE_BIT) == 0) {
                clause = new Clause(Term.read(in), field, required, prohibited, conf);
            } else {
                clause = new Clause(Phrase.read(in), field, required, prohibited, conf);
            }
            clause.weight = weight;
            return clause;
        }

        /**
         * Renders the clause in query syntax: an optional {@code -} for prohibited
         * clauses, an optional {@code field:} prefix for non-default fields, then the
         * term or phrase text.
         */
        @Override
        public String toString() {
            StringBuilder buffer = new StringBuilder();
            // Required clauses get no prefix; only prohibited ones are marked.
            if (isProhibited) {
                buffer.append("-");
            }

            if (!DEFAULT_FIELD.equals(field)) {
                buffer.append(field);
                buffer.append(":");
            }

            if (!isPhrase() && new QueryFilters(conf).isRawField(field)) {
                buffer.append('"'); // quote raw terms
                buffer.append(termOrPhrase.toString());
                buffer.append('"');
            } else {
                buffer.append(termOrPhrase.toString());
            }

            return buffer.toString();
        }

        /**
         * Two clauses are equal when their flags, field, weight and term/phrase are
         * equal. The attached configuration is not compared.
         */
        @Override
        public boolean equals(Object o) {
            if (!(o instanceof Clause)) {
                return false;
            }
            Clause other = (Clause) o;
            return (this.isRequired == other.isRequired)
                    && (this.isProhibited == other.isProhibited)
                    // Float.compare agrees with the floatToIntBits-based hashCode for NaN and +/-0.0.
                    && (Float.compare(this.weight, other.weight) == 0)
                    // The field is part of a clause's identity: "title:foo" is not "foo".
                    && (this.field == null ? other.field == null : this.field.equals(other.field))
                    && (this.termOrPhrase == null ? other.termOrPhrase == null
                            : this.termOrPhrase.equals(other.termOrPhrase));
        }

        /** Hash code consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode() {
            return (this.isRequired ? 0 : 1) ^ (this.isProhibited ? 2 : 4) ^ Float.floatToIntBits(this.weight)
                    ^ (this.field != null ? this.field.hashCode() : 0)
                    ^ (this.termOrPhrase != null ? termOrPhrase.hashCode() : 0);
        }

        /**
         * Returns a shallow copy: the term/phrase object and the configuration are
         * shared with the original.
         */
        @Override
        public Object clone() {
            try {
                return super.clone();
            } catch (CloneNotSupportedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /** A single-term query clause. */
    public static class Term {
        /** The text of this term. */
        private String text;

        /** Creates a term holding the given text. */
        public Term(String text) {
            this.text = text;
        }

        /**
         * Writes the term text to {@code out} as a UTF string.
         *
         * @throws IOException if writing to {@code out} fails
         */
        public void write(DataOutput out) throws IOException {
            out.writeUTF(text);
        }

        /**
         * Reads a term previously written with {@link #write(DataOutput)}.
         *
         * @throws IOException if reading from {@code in} fails
         */
        public static Term read(DataInput in) throws IOException {
            return new Term(in.readUTF());
        }

        /** Returns the term text unchanged. */
        @Override
        public String toString() {
            return text;
        }

        /** Terms are equal when their texts are equal; two null texts count as equal. */
        @Override
        public boolean equals(Object o) {
            if (o instanceof Term) {
                String otherText = ((Term) o).text;
                if (text == null) {
                    return otherText == null;
                }
                return text.equals(otherText);
            }
            return false;
        }

        /** Hash of the text, or 0 when the text is null. */
        @Override
        public int hashCode() {
            if (text == null) {
                return 0;
            }
            return text.hashCode();
        }
    }

    /** A phrase query clause: an ordered sequence of {@link Term}s. */
    public static class Phrase {
        private Term[] terms;

        /**
         * Creates a phrase over the given terms.
         *
         * @param terms the terms of the phrase; the array is stored as-is, not copied
         */
        public Phrase(Term[] terms) {
            this.terms = terms;
        }

        /**
         * Creates a phrase from raw term texts, wrapping each string in a {@link Term}.
         *
         * @param terms the term texts, in phrase order
         */
        public Phrase(String[] terms) {
            this.terms = new Term[terms.length];
            for (int i = 0; i < terms.length; i++) {
                this.terms[i] = new Term(terms[i]);
            }
        }

        /** Returns the internal term array (not a copy). */
        public Term[] getTerms() {
            return terms;
        }

        /**
         * Writes the term count as a single byte followed by each term.
         *
         * @param out the stream to write to
         * @throws IOException if writing to {@code out} fails
         */
        public void write(DataOutput out) throws IOException {
            out.writeByte(terms.length);
            for (int i = 0; i < terms.length; i++) {
                terms[i].write(out);
            }
        }

        /**
         * Reads a phrase in the layout produced by {@link #write(DataOutput)}.
         *
         * @param in the stream to read from
         * @return the deserialized phrase
         * @throws IOException if reading from {@code in} fails
         */
        public static Phrase read(DataInput in) throws IOException {
            // The count was written with writeByte (low eight bits); reading it unsigned
            // keeps counts 128..255 intact instead of producing a negative array size.
            int length = in.readUnsignedByte();
            Term[] terms = new Term[length];
            for (int i = 0; i < length; i++) {
                terms[i] = Term.read(in);
            }
            return new Phrase(terms);
        }

        /** Renders the phrase as its terms separated by single spaces, wrapped in double quotes. */
        @Override
        public String toString() {
            StringBuilder buffer = new StringBuilder();
            buffer.append("\"");
            for (int i = 0; i < terms.length; i++) {
                buffer.append(terms[i].toString());
                if (i != terms.length - 1) {
                    buffer.append(" ");
                }
            }
            buffer.append("\"");
            return buffer.toString();
        }

        /** Phrases are equal when they hold equal terms in the same order. */
        @Override
        public boolean equals(Object o) {
            if (!(o instanceof Phrase)) {
                return false;
            }
            Phrase other = (Phrase) o;
            // Arrays.equals compares the two lengths before the elements, so phrases of
            // different length are never equal and never indexed out of range.
            return Arrays.equals(this.terms, other.terms);
        }

        /** XOR of the term count and every term's hash; equal phrases hash equally. */
        @Override
        public int hashCode() {
            int hashCode = terms.length;
            for (int i = 0; i < terms.length; i++) {
                hashCode ^= terms[i].hashCode();
            }
            return hashCode;
        }

    }

    /** The clauses of this query, in the order they were added. */
    private ArrayList<Clause> clauses = new ArrayList<Clause>();

    /** Configuration passed to every clause this query creates or deserializes. */
    private Configuration conf;

    /** Empty typed array handed to {@code toArray} by {@link #getClauses()}. */
    private static final Clause[] CLAUSES_PROTO = new Clause[0];

    /** Creates an empty query with no configuration set. */
    public Query() {
        this(null);
    }

    /**
     * Creates an empty query that uses the given configuration.
     *
     * @param conf the configuration to store
     */
    public Query(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Stores the configuration that is passed to clauses created by this query
     * from now on.
     *
     * @param conf the configuration to store
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Returns the configuration stored in this query.
     *
     * @return the stored configuration, or null if none has been set
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Returns all clauses of this query.
     *
     * @return an array holding the clauses in the order they were added
     */
    public Clause[] getClauses() {
        Clause[] snapshot = clauses.toArray(CLAUSES_PROTO);
        return snapshot;
    }

    /**
     * Appends a required single-term clause in the default field.
     *
     * @param term the term text
     */
    public void addRequiredTerm(String term) {
        addRequiredTerm(term, Clause.DEFAULT_FIELD);
    }

    /**
     * Appends a required single-term clause in the given field.
     *
     * @param term  the term text
     * @param field the field the term applies to
     */
    public void addRequiredTerm(String term, String field) {
        Term wrapped = new Term(term);
        Clause required = new Clause(wrapped, field, true, false, this.conf);
        clauses.add(required);
    }

    /**
     * Appends a prohibited single-term clause in the default field.
     *
     * @param term the term text
     */
    public void addProhibitedTerm(String term) {
        addProhibitedTerm(term, Clause.DEFAULT_FIELD);
    }

    /**
     * Appends a prohibited single-term clause in the given field.
     *
     * @param term  the term text
     * @param field the field the term applies to
     */
    public void addProhibitedTerm(String term, String field) {
        Term wrapped = new Term(term);
        Clause prohibited = new Clause(wrapped, field, false, true, this.conf);
        clauses.add(prohibited);
    }

    /**
     * Appends a required phrase clause in the default field.
     *
     * @param terms the term texts of the phrase, in order
     */
    public void addRequiredPhrase(String[] terms) {
        addRequiredPhrase(terms, Clause.DEFAULT_FIELD);
    }

    /**
     * Appends a required phrase clause in the given field. An empty phrase adds
     * nothing; a one-term phrase is added as a plain term clause.
     *
     * @param terms the term texts of the phrase, in order
     * @param field the field the phrase applies to
     */
    public void addRequiredPhrase(String[] terms, String field) {
        switch (terms.length) {
        case 0:
            // ignore empty phrase
            return;
        case 1:
            // optimize to term query
            addRequiredTerm(terms[0], field);
            return;
        default:
            clauses.add(new Clause(new Phrase(terms), field, true, false, this.conf));
        }
    }

    /**
     * Appends a prohibited phrase clause in the default field.
     *
     * @param terms the term texts of the phrase, in order
     */
    public void addProhibitedPhrase(String[] terms) {
        addProhibitedPhrase(terms, Clause.DEFAULT_FIELD);
    }

    /**
     * Appends a prohibited phrase clause in the given field. An empty phrase adds
     * nothing; a one-term phrase is added as a plain term clause.
     *
     * @param terms the term texts of the phrase, in order
     * @param field the field the phrase applies to
     */
    public void addProhibitedPhrase(String[] terms, String field) {
        switch (terms.length) {
        case 0:
            // ignore empty phrase
            return;
        case 1:
            // optimize to term query
            addProhibitedTerm(terms[0], field);
            return;
        default:
            clauses.add(new Clause(new Phrase(terms), field, false, true, this.conf));
        }
    }

    /**
     * Serializes this query: the clause count as a single byte, each clause in
     * order, then the query parameters.
     *
     * @param out the stream to write to
     * @throws IOException if writing to {@code out} fails
     */
    public void write(DataOutput out) throws IOException {
        int count = clauses.size();
        out.writeByte(count);
        for (Clause clause : clauses) {
            clause.write(out);
        }
        params.write(out);
    }

    /**
     * Creates a query with the given configuration and fills it from {@code in}.
     *
     * @param in   the stream to read from
     * @param conf the configuration for the new query
     * @return the deserialized query
     * @throws IOException if reading from {@code in} fails
     */
    public static Query read(DataInput in, Configuration conf) throws IOException {
        Query query = new Query(conf);
        query.readFields(in);
        return query;
    }

    /**
     * Replaces the contents of this query with data read from {@code in}: a
     * one-byte clause count, that many clauses, then the query parameters.
     *
     * @param in the stream to read from
     * @throws IOException if reading from {@code in} fails
     */
    public void readFields(DataInput in) throws IOException {
        clauses.clear();
        // The count is written with writeByte, i.e. as its low eight bits. Reading it
        // unsigned keeps counts 128..255 intact; a signed read would give a negative
        // bound, skip every clause, and leave the parameters reading clause bytes.
        int length = in.readUnsignedByte();
        for (int i = 0; i < length; i++) {
            clauses.add(Clause.read(in, this.conf));
        }

        params.readFields(in);
    }

    /** Renders the query as its clauses' string forms joined by single spaces. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        String separator = "";
        for (Clause clause : clauses) {
            text.append(separator).append(clause.toString());
            separator = " ";
        }
        return text.toString();
    }

    /** Queries are equal when both their clause lists and their parameters are equal. */
    @Override
    public boolean equals(Object o) {
        if (o instanceof Query) {
            Query that = (Query) o;
            if (!clauses.equals(that.clauses)) {
                return false;
            }
            return params.equals(that.params);
        }
        return false;
    }

    /**
     * Hash code derived from the clause list only. The parameters are not mixed
     * in, which still keeps equal queries hashing equally.
     */
    @Override
    public int hashCode() {
        return clauses.hashCode();
    }

    /**
     * Returns a copy of this query with its own clause list. The {@code Clause}
     * objects in the list, the parameters and the configuration are shared with
     * the original.
     */
    @Override
    public Object clone() {
        Query copy;
        try {
            copy = (Query) super.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
        // Give the copy an independent list so adding clauses to one query does not affect the other.
        copy.clauses = new ArrayList<Clause>(this.clauses);
        return copy;
    }

    /**
     * Flattens a query into the text terms that it contains. These are the terms
     * which should be highlighted in matching documents; terms of prohibited
     * clauses are left out.
     *
     * @return the term texts in clause order, with the terms of a phrase listed individually
     */
    public String[] getTerms() {
        ArrayList<String> collected = new ArrayList<String>();
        for (Clause clause : clauses) {
            if (clause.isProhibited()) {
                continue;
            }
            if (!clause.isPhrase()) {
                collected.add(clause.getTerm().toString());
                continue;
            }
            for (Term term : clause.getPhrase().getTerms()) {
                collected.add(term.toString());
            }
        }
        return collected.toArray(new String[collected.size()]);
    }

    /**
     * Parse a query from a string using a language specific analyzer.
     * 
     * @param queryString
     *            is the raw query string to parse
     * @param queryLang
     *            is a two-letters language code used to identify which
     *            {@link org.apache.nutch.analysis.NutchAnalyzer} should be used
     *            to parse the query string.
     * @param conf
     *            is the configuration used to look up the analyzer and the
     *            query filters
     * @return the parsed query, or <code>null</code> when the query string
     *         could not be parsed (the failure is logged)
     * @throws IOException
     *             declared for the underlying analysis calls
     * @see org.apache.nutch.analysis.AnalyzerFactory
     */
    public static Query parse(String queryString, String queryLang, Configuration conf) throws IOException {
        try {
            return fixup(NutchAnalysis.parseQuery(queryString, AnalyzerFactory.get(conf).get(queryLang), conf),
                    conf);
        } catch (ParseException e) {
            // Report through the class logger rather than stderr. Callers still get
            // null for an unparsable query, exactly as before.
            LOG.warn("Could not parse query string: " + queryString, e);
            return null;
        }
    }

    /**
     * Parse a query from a string, passing <code>null</code> as the language
     * code.
     * 
     * @return the parsed query, or <code>null</code> when the query string
     *         could not be parsed
     */
    public static Query parse(String queryString, Configuration conf) throws IOException {
        return parse(queryString, null, conf);
    }

    /**
     * Convert clauses in unknown fields to the default field. A clause whose field
     * is not recognized by the query filters is replaced by a phrase clause in the
     * default field, with the field name as the phrase's first term followed by
     * the original term(s). Clauses in known fields are copied unchanged.
     *
     * @param input the query to walk
     * @param conf  the configuration for the output query and the filter lookup
     * @return a new query holding the converted clauses
     */
    private static Query fixup(Query input, Configuration conf) {
        Query output = new Query(conf);
        Clause[] clauses = input.getClauses();
        if (clauses.length == 0) {
            return output; // nothing to inspect, so no filters are needed
        }

        // Loop-invariant: build the filters once instead of once per clause.
        QueryFilters filters = new QueryFilters(conf);
        for (int i = 0; i < clauses.length; i++) {
            Clause c = clauses[i];
            if (!filters.isField(c.getField())) { // unknown field
                ArrayList<Term> terms = new ArrayList<Term>();
                terms.add(new Term(c.getField())); // field name leads the phrase
                if (c.isPhrase()) {
                    terms.addAll(Arrays.asList(c.getPhrase().getTerms()));
                } else {
                    terms.add(c.getTerm());
                }
                // Work on a copy so the input query's clause is left untouched.
                c = (Clause) c.clone();
                c.field = Clause.DEFAULT_FIELD; // use default field instead
                c.termOrPhrase = new Phrase(terms.toArray(new Term[terms.size()]));
            }
            output.clauses.add(c); // copy clause to output
        }
        return output;
    }

    /**
     * For debugging: reads query strings from standard input, one per line, and
     * prints each parsed query and its filtered translation. Stops at end of
     * input.
     */
    public static void main(String[] args) throws Exception {
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        Configuration conf = NutchConfiguration.create();
        while (true) {
            System.out.print("Query: ");
            String line = in.readLine();
            if (line == null) {
                break; // end of input: readLine() would keep returning null forever
            }
            Query query = parse(line, conf);
            System.out.println("Parsed: " + query);
            if (query == null) {
                continue; // parse() returns null for an unparsable line; nothing to translate
            }
            System.out.println("Translated: " + new QueryFilters(conf).filter(query));
        }
    }
}