// Java tutorial
/*
 * Copyright (C) 2011 Near Infinity Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.nearinfinity.blur.analysis;

import static com.nearinfinity.blur.lucene.LuceneConstant.LUCENE_VERSION;
import static com.nearinfinity.blur.utils.BlurConstants.PRIME_DOC;
import static com.nearinfinity.blur.utils.BlurConstants.RECORD_ID;
import static com.nearinfinity.blur.utils.BlurConstants.ROW_ID;
import static com.nearinfinity.blur.utils.BlurConstants.SUPER;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.lang.reflect.Constructor;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.util.Version;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TJSONProtocol;
import org.apache.thrift.transport.TMemoryBuffer;
import org.apache.thrift.transport.TMemoryInputTransport;

import com.nearinfinity.blur.thrift.generated.AlternateColumnDefinition;
import com.nearinfinity.blur.thrift.generated.AnalyzerDefinition;
import com.nearinfinity.blur.thrift.generated.ColumnDefinition;
import com.nearinfinity.blur.thrift.generated.ColumnFamilyDefinition;

/**
 * An {@link Analyzer} that routes each Lucene field to a per-field analyzer
 * derived from a Thrift {@link AnalyzerDefinition}. It also tracks which
 * fields are full-text indexed, which sub-fields (alternate column
 * definitions) exist for a main field, and the {@link Store} setting for
 * each field. Instances can be serialized to/from Thrift-JSON via
 * {@link #toJSON()} and the static {@code create(...)} factories.
 */
public class BlurAnalyzer extends Analyzer {

  private static final String STANDARD = "org.apache.lucene.analysis.standard.StandardAnalyzer";

  /** Analyzer that treats every field as a single keyword token. */
  public static final BlurAnalyzer BLANK_ANALYZER = new BlurAnalyzer(new KeywordAnalyzer());

  // Optional short-name -> class aliases consulted before Class.forName.
  // NOTE(review): never populated in this file — presumably filled in
  // elsewhere or reserved for future use; verify before removing.
  private static Map<String, Class<? extends Analyzer>> aliases =
      new HashMap<String, Class<? extends Analyzer>>();

  private Map<String, Store> _storeMap = new HashMap<String, Store>();
  // main field name -> set of its sub-field names (e.g. "fam.col" -> {"fam.col.sub"})
  private Map<String, Set<String>> _subIndexNameLookups = new HashMap<String, Set<String>>();
  private Set<String> _fullTextFields = new HashSet<String>();
  private AnalyzerDefinition _analyzerDefinition;
  private PerFieldAnalyzerWrapper _wrapper;
  private Analyzer _fullTextAnalyzer = new StandardAnalyzer(LUCENE_VERSION);
  private HashMap<String, Analyzer> _analyzers;

  /**
   * Registers {@code name} as a sub-field of the field obtained by stripping
   * the last dot-separated component (e.g. "fam.col.sub" is a sub-field of
   * "fam.col").
   *
   * @param name dotted sub-field name; must contain at least one '.'
   * @throws IllegalArgumentException if {@code name} contains no '.'
   */
  public void addSubField(String name) {
    int lastIndexOf = name.lastIndexOf('.');
    if (lastIndexOf < 0) {
      // Fail with a clear message instead of a StringIndexOutOfBoundsException.
      throw new IllegalArgumentException("Sub field name [" + name + "] must contain a '.'");
    }
    String mainFieldName = name.substring(0, lastIndexOf);
    Set<String> set = _subIndexNameLookups.get(mainFieldName);
    if (set == null) {
      set = new TreeSet<String>();
      _subIndexNameLookups.put(mainFieldName, set);
    }
    set.add(name);
  }

  /**
   * @return the sub-field names registered for {@code indexName}, or
   *         {@code null} if none were registered.
   */
  public Set<String> getSubIndexNames(String indexName) {
    return _subIndexNameLookups.get(indexName);
  }

  /** Wraps a single analyzer as the default for all fields. */
  public BlurAnalyzer(Analyzer analyzer) {
    _analyzerDefinition = new AnalyzerDefinition();
    _wrapper = new PerFieldAnalyzerWrapper(analyzer);
  }

  /**
   * Builds the per-field analyzer map from a Thrift definition. Reserved
   * fields (row id, record id, prime doc) always use a
   * {@link KeywordAnalyzer}; the {@code SUPER} field uses the configured
   * full-text analyzer. A missing default definition falls back to
   * {@link StandardAnalyzer}.
   */
  public BlurAnalyzer(AnalyzerDefinition analyzerDefinition) {
    _analyzerDefinition = analyzerDefinition;
    ColumnDefinition defaultDefinition = analyzerDefinition.getDefaultDefinition();
    String fullTextAnalyzerClassName = analyzerDefinition.fullTextAnalyzerClassName;
    if (fullTextAnalyzerClassName != null) {
      _fullTextAnalyzer = getAnalyzerByClassName(fullTextAnalyzerClassName, aliases);
    }
    if (defaultDefinition == null) {
      defaultDefinition = new ColumnDefinition(STANDARD, true, null);
      analyzerDefinition.setDefaultDefinition(defaultDefinition);
    }
    Analyzer defaultAnalyzer = getAnalyzerByClassName(defaultDefinition.getAnalyzerClassName(), aliases);
    KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
    _analyzers = new HashMap<String, Analyzer>();
    _analyzers.put(ROW_ID, keywordAnalyzer);
    _analyzers.put(RECORD_ID, keywordAnalyzer);
    _analyzers.put(PRIME_DOC, keywordAnalyzer);
    _analyzers.put(SUPER, _fullTextAnalyzer);
    load(_analyzers);
    _wrapper = new PerFieldAnalyzerWrapper(defaultAnalyzer, _analyzers);
  }

  /**
   * @return the analyzer for {@code name} cast to {@link FieldConverter} if
   *         it implements that interface, otherwise {@code null}.
   */
  public FieldConverter getFieldConverter(String name) {
    if (_analyzers == null) {
      return null;
    }
    Analyzer analyzer = _analyzers.get(name);
    // instanceof is false for null, so no separate null check is needed.
    if (analyzer instanceof FieldConverter) {
      return (FieldConverter) analyzer;
    }
    return null;
  }

  /** Loads analyzers for every column family in the definition. */
  private void load(Map<String, Analyzer> analyzers) {
    Map<String, ColumnFamilyDefinition> familyDefinitions = _analyzerDefinition.columnFamilyDefinitions;
    if (familyDefinitions != null) {
      for (String family : familyDefinitions.keySet()) {
        ColumnFamilyDefinition familyDefinition = familyDefinitions.get(family);
        load(family, familyDefinition, analyzers);
      }
    }
  }

  /** Loads analyzers for every column in one family. */
  private void load(String family, ColumnFamilyDefinition familyDefinition, Map<String, Analyzer> analyzers) {
    Map<String, ColumnDefinition> columnDefinitions = familyDefinition.columnDefinitions;
    for (String column : columnDefinitions.keySet()) {
      ColumnDefinition columnDefinition = columnDefinitions.get(column);
      load(family, familyDefinition, column, columnDefinition, analyzers);
    }
  }

  /**
   * Registers the analyzer for one column ("family.column"), recursing into
   * any alternate (sub-column) definitions first, and records whether the
   * column is full-text indexed.
   */
  private void load(String family, ColumnFamilyDefinition familyDefinition, String column,
      ColumnDefinition columnDefinition, Map<String, Analyzer> analyzers) {
    Map<String, AlternateColumnDefinition> alternateColumnDefinitions =
        columnDefinition.alternateColumnDefinitions;
    if (alternateColumnDefinitions != null) {
      for (String subColumn : alternateColumnDefinitions.keySet()) {
        AlternateColumnDefinition alternateColumnDefinition = alternateColumnDefinitions.get(subColumn);
        load(family, familyDefinition, column, columnDefinition, subColumn,
            alternateColumnDefinition, analyzers);
      }
    }
    String fieldName = family + "." + column;
    Analyzer analyzer = getAnalyzerByClassName(columnDefinition.getAnalyzerClassName(), aliases);
    analyzers.put(fieldName, analyzer);
    if (columnDefinition.isFullTextIndex()) {
      _fullTextFields.add(fieldName);
    }
  }

  /**
   * Registers the analyzer for one alternate sub-column
   * ("family.column.subColumn"). Sub-fields are never stored (Store.NO) and
   * are tracked via {@link #addSubField(String)}.
   */
  private void load(String family, ColumnFamilyDefinition familyDefinition, String column,
      ColumnDefinition columnDefinition, String subColumn,
      AlternateColumnDefinition alternateColumnDefinition, Map<String, Analyzer> analyzers) {
    String fieldName = family + "." + column + "." + subColumn;
    Analyzer analyzer = getAnalyzerByClassName(alternateColumnDefinition.getAnalyzerClassName(), aliases);
    analyzers.put(fieldName, analyzer);
    putStore(fieldName, Store.NO);
    addSubField(fieldName);
  }

  /** Sets the Lucene {@link Store} option for a field. */
  public void putStore(String name, Store store) {
    _storeMap.put(name, store);
  }

  /**
   * Resolves an analyzer instance by class name: first via
   * {@link FieldConverterUtil}, then via the alias map, then via
   * {@link Class#forName(String)}. Tries the no-arg constructor first and
   * falls back to a {@code (Version)} constructor, as used by most Lucene
   * analyzers.
   *
   * @throws RuntimeException wrapping any reflective failure
   */
  @SuppressWarnings("unchecked")
  private static Analyzer getAnalyzerByClassName(String className,
      Map<String, Class<? extends Analyzer>> aliases) {
    if (FieldConverterUtil.isType(className)) {
      return FieldConverterUtil.getAnalyzer(className);
    }
    try {
      Class<? extends Analyzer> clazz = aliases.get(className);
      if (clazz == null) {
        clazz = (Class<? extends Analyzer>) Class.forName(className);
      }
      try {
        return (Analyzer) clazz.newInstance();
      } catch (Exception e) {
        Constructor<?> constructor = clazz.getConstructor(new Class[] { Version.class });
        return (Analyzer) constructor.newInstance(LUCENE_VERSION);
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /** @return the configured {@link Store} for the field, defaulting to YES. */
  public Store getStore(String indexName) {
    Store store = _storeMap.get(indexName);
    if (store == null) {
      return Store.YES;
    }
    return store;
  }

  /** All fields are indexed analyzed without norms. */
  public Index getIndex(String indexName) {
    return Index.ANALYZED_NO_NORMS;
  }

  /**
   * Serializes the analyzer definition to a Thrift-JSON string.
   *
   * @throws RuntimeException wrapping any Thrift serialization error
   */
  public String toJSON() {
    TMemoryBuffer trans = new TMemoryBuffer(1024);
    TJSONProtocol protocol = new TJSONProtocol(trans);
    try {
      _analyzerDefinition.write(protocol);
    } catch (TException e) {
      throw new RuntimeException(e);
    }
    trans.close();
    byte[] array = trans.getArray();
    // Thrift JSON is UTF-8; decode explicitly rather than with the
    // platform default charset.
    return new String(array, 0, trans.length(), StandardCharsets.UTF_8);
  }

  /**
   * @return true if the field is full-text indexed, either individually or
   *         because the default definition enables full-text indexing for
   *         every field.
   */
  public boolean isFullTextField(String fieldName) {
    ColumnDefinition defaultDefinition = _analyzerDefinition.getDefaultDefinition();
    if (defaultDefinition != null && defaultDefinition.fullTextIndex) {
      return true;
    }
    return _fullTextFields.contains(fieldName);
  }

  public AnalyzerDefinition getAnalyzerDefinition() {
    return _analyzerDefinition;
  }

  public void close() {
    _wrapper.close();
  }

  public int getOffsetGap(Fieldable field) {
    return _wrapper.getOffsetGap(field);
  }

  public int getPositionIncrementGap(String fieldName) {
    return _wrapper.getPositionIncrementGap(fieldName);
  }

  public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    return _wrapper.reusableTokenStream(fieldName, reader);
  }

  public final TokenStream tokenStream(String fieldName, Reader reader) {
    return _wrapper.tokenStream(fieldName, reader);
  }

  /** Reads a Thrift-JSON analyzer definition from a local file. */
  public static BlurAnalyzer create(File file) throws IOException {
    FileInputStream inputStream = new FileInputStream(file);
    try {
      return create(inputStream);
    } finally {
      inputStream.close();
    }
  }

  /**
   * Reads a Thrift-JSON analyzer definition from a stream. The stream is
   * fully consumed; closing it remains the caller's responsibility (a
   * redundant close by {@link #getBytes(InputStream)} is harmless).
   *
   * @throws RuntimeException wrapping any Thrift deserialization error
   */
  public static BlurAnalyzer create(InputStream inputStream) throws IOException {
    TMemoryInputTransport trans = new TMemoryInputTransport(getBytes(inputStream));
    TJSONProtocol protocol = new TJSONProtocol(trans);
    AnalyzerDefinition analyzerDefinition = new AnalyzerDefinition();
    try {
      analyzerDefinition.read(protocol);
    } catch (TException e) {
      throw new RuntimeException(e);
    }
    trans.close();
    return new BlurAnalyzer(analyzerDefinition);
  }

  /** Drains the stream into a byte array; also closes the stream. */
  private static byte[] getBytes(InputStream inputStream) throws IOException {
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    byte[] buffer = new byte[1024];
    int num;
    while ((num = inputStream.read(buffer)) != -1) {
      outputStream.write(buffer, 0, num);
    }
    inputStream.close();
    outputStream.close(); // no-op for ByteArrayOutputStream, kept for symmetry
    return outputStream.toByteArray();
  }

  /** Parses a Thrift-JSON analyzer definition from a string. */
  public static BlurAnalyzer create(String jsonStr) throws IOException {
    // Thrift JSON is UTF-8; encode explicitly rather than with the
    // platform default charset.
    InputStream inputStream = new ByteArrayInputStream(jsonStr.getBytes(StandardCharsets.UTF_8));
    try {
      return create(inputStream);
    } finally {
      inputStream.close();
    }
  }

  /** Reads a Thrift-JSON analyzer definition from a Hadoop filesystem path. */
  public static BlurAnalyzer create(Path path) throws IOException {
    FileSystem fileSystem = FileSystem.get(path.toUri(), new Configuration());
    FSDataInputStream inputStream = fileSystem.open(path);
    try {
      return create(inputStream);
    } finally {
      inputStream.close();
    }
  }
}