Java tutorial
/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * See LICENSE.txt included in this distribution for the specific * language governing permissions and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at LICENSE.txt. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018 Oracle and/or its affiliates. All rights reserved. */ package org.opengrok.suggest; import org.apache.commons.io.FileUtils; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; import org.apache.lucene.search.spell.LuceneDictionary; import org.apache.lucene.search.suggest.InputIterator; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.opengrok.suggest.popular.PopularityCounter; import org.opengrok.suggest.popular.PopularityMap; import org.opengrok.suggest.popular.impl.chronicle.ChronicleMapAdapter; import org.opengrok.suggest.popular.impl.chronicle.ChronicleMapConfiguration; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.logging.Level; import java.util.logging.Logger; /** * Holds all the necessary data for one index directory. In the context of OpenGrok it is one project. */ class SuggesterProjectData implements Closeable { private static final String TMP_DIR_PROPERTY = "java.io.tmpdir"; private static final Logger logger = Logger.getLogger(SuggesterProjectData.class.getName()); private static final int MAX_TERM_SIZE = Short.MAX_VALUE - 3; private static final String WFST_TEMP_FILE_PREFIX = "opengrok_suggester_wfst"; private static final String WFST_FILE_SUFFIX = ".wfst"; private static final String SEARCH_COUNT_MAP_NAME = "search_count.db"; private static final String VERSION_FILE_NAME = "version.txt"; private static final int DEFAULT_WEIGHT = 0; private static final double AVERAGE_LENGTH_DEFAULT = 22; private Directory indexDir; private Path suggesterDir; private final Map<String, WFSTCompletionLookup> lookups = new HashMap<>(); private final Map<String, PopularityMap> searchCountMaps = new HashMap<>(); private final Map<String, Double> averageLengths = new HashMap<>(); private boolean allowMostPopular; private final ReadWriteLock lock = new ReentrantReadWriteLock(); private Set<String> fields; private final Directory tempDir; SuggesterProjectData(final Directory indexDir, final Path suggesterDir, final boolean allowMostPopular, final Set<String> fields) throws IOException { this.indexDir = indexDir; this.suggesterDir = suggesterDir; this.allowMostPopular = allowMostPopular; tempDir = FSDirectory.open(Paths.get(System.getProperty(TMP_DIR_PROPERTY))); initFields(fields); } private void initFields(final Set<String> fields) throws IOException { try (IndexReader indexReader = DirectoryReader.open(indexDir)) { Collection<String> indexedFields = MultiFields.getIndexedFields(indexReader); if (fields == null) { this.fields = new HashSet<>(indexedFields); } else if (!indexedFields.containsAll(fields)) { Set<String> copy = new HashSet<>(fields); copy.removeAll(indexedFields); logger.log(Level.WARNING, "Fields {0} will be ignored because they were not found in index directory {1}", new Object[] { copy, indexDir }); copy = new HashSet<>(fields); copy.retainAll(indexedFields); this.fields = copy; } else { this.fields = new HashSet<>(fields); } } } /** * Initializes the data structure. Rebuild is launched only if necessary. * @throws IOException if initialization was not successful */ public void init() throws IOException { lock.writeLock().lock(); try { long commitVersion = getCommitVersion(); if (hasStoredData() && commitVersion == getDataVersion()) { loadStoredWFSTs(); } else { createSuggesterDir(); build(); } if (allowMostPopular) { initSearchCountMap(); } storeDataVersion(commitVersion); } finally { lock.writeLock().unlock(); } } private long getCommitVersion() throws IOException { List<IndexCommit> commits = DirectoryReader.listCommits(indexDir); if (commits.size() > 1) { throw new IllegalStateException( "IndexDeletionPolicy changed, normally only one commit should be stored"); } IndexCommit commit = commits.get(0); return commit.getGeneration(); } private boolean hasStoredData() { if (!suggesterDir.toFile().exists()) { return false; } File[] children = suggesterDir.toFile().listFiles(); return children != null && children.length > 0; } private void loadStoredWFSTs() throws IOException { try (IndexReader indexReader = DirectoryReader.open(indexDir)) { for (String field : fields) { File WFSTfile = getWFSTFile(field); if (WFSTfile.exists()) { WFSTCompletionLookup WFST = loadStoredWFST(WFSTfile); lookups.put(field, WFST); } else { logger.log(Level.INFO, "Missing WFST file for {0} field in {1}, creating a new one", new Object[] { field, suggesterDir }); WFSTCompletionLookup lookup = build(indexReader, field); store(lookup, field); lookups.put(field, lookup); } } } } private WFSTCompletionLookup loadStoredWFST(final File file) throws IOException { try (FileInputStream fis = new FileInputStream(file)) { WFSTCompletionLookup lookup = createWFST(); lookup.load(fis); return lookup; } } private WFSTCompletionLookup createWFST() { return new WFSTCompletionLookup(tempDir, WFST_TEMP_FILE_PREFIX); } private File getWFSTFile(final String field) { return getFile(field + WFST_FILE_SUFFIX); } private File getFile(final String fileName) { return suggesterDir.resolve(fileName).toFile(); } /** * Forces the rebuild of the data structure. * @throws IOException if some error occurred */ public void rebuild() throws IOException { lock.writeLock().lock(); try { build(); if (allowMostPopular) { initSearchCountMap(); } storeDataVersion(getCommitVersion()); } finally { lock.writeLock().unlock(); } } private void build() throws IOException { try (IndexReader indexReader = DirectoryReader.open(indexDir)) { for (String field : fields) { WFSTCompletionLookup lookup = build(indexReader, field); store(lookup, field); lookups.put(field, lookup); } } } private WFSTCompletionLookup build(final IndexReader indexReader, final String field) throws IOException { WFSTInputIterator iterator = new WFSTInputIterator( new LuceneDictionary(indexReader, field).getEntryIterator(), indexReader, field, getSearchCounts(field)); WFSTCompletionLookup lookup = createWFST(); lookup.build(iterator); double averageLength = (double) iterator.termLengthAccumulator / lookup.getCount(); averageLengths.put(field, averageLength); return lookup; } private void store(final WFSTCompletionLookup WFST, final String field) throws IOException { FileOutputStream fos = new FileOutputStream(getWFSTFile(field)); WFST.store(fos); } private void createSuggesterDir() throws IOException { if (!suggesterDir.toFile().exists()) { boolean directoryCreated = suggesterDir.toFile().mkdirs(); if (!directoryCreated) { throw new IOException("Could not create suggester directory " + suggesterDir); } } } private void initSearchCountMap() throws IOException { searchCountMaps.values().forEach(PopularityMap::close); searchCountMaps.clear(); for (String field : fields) { ChronicleMapConfiguration conf = ChronicleMapConfiguration.load(suggesterDir, field); if (conf == null) { // it was not yet initialized conf = new ChronicleMapConfiguration((int) lookups.get(field).getCount(), getAverageLength(field)); conf.save(suggesterDir, field); } File f = getChronicleMapFile(field); ChronicleMapAdapter m; try { m = new ChronicleMapAdapter(field, conf.getAverageKeySize(), conf.getEntries(), f); } catch (Exception e) { logger.log(Level.SEVERE, "Could not create ChronicleMap, most popular completion disabled, if you are using " + "JDK9+ make sure to specify: " + "--add-exports java.base/jdk.internal.ref=ALL-UNNAMED " + "--add-exports java.base/jdk.internal.misc=ALL-UNNAMED " + "--add-exports java.base/sun.nio.ch=ALL-UNNAMED", e); return; } if (getCommitVersion() != getDataVersion()) { removeOldTerms(m, lookups.get(field)); if (conf.getEntries() < lookups.get(field).getCount()) { int newEntriesCount = (int) lookups.get(field).getCount(); double newKeyAvgLength = getAverageLength(field); conf.setEntries(newEntriesCount); conf.setAverageKeySize(newKeyAvgLength); conf.save(suggesterDir, field); m.resize(newEntriesCount, newKeyAvgLength); } } searchCountMaps.put(field, m); } } private File getChronicleMapFile(final String field) { return suggesterDir.resolve(field + "_" + SEARCH_COUNT_MAP_NAME).toFile(); } private double getAverageLength(final String field) { if (averageLengths.containsKey(field)) { return averageLengths.get(field); } logger.log(Level.FINE, "Could not determine average length for field {0}, using default one", field); return AVERAGE_LENGTH_DEFAULT; } private void removeOldTerms(final ChronicleMapAdapter adapter, final WFSTCompletionLookup lookup) { adapter.removeIf(key -> lookup.get(key.toString()) == null); } /** * Looks up the terms in the WFST data structure. * @param field term field * @param prefix prefix the returned terms must contain * @param resultSize number of terms to return * @return terms with highest score */ public List<Lookup.LookupResult> lookup(final String field, final String prefix, final int resultSize) { lock.readLock().lock(); try { WFSTCompletionLookup lookup = lookups.get(field); if (lookup == null) { logger.log(Level.WARNING, "No WFST for field {0} in {1}", new Object[] { field, suggesterDir }); return Collections.emptyList(); } return lookup.lookup(prefix, false, resultSize); } catch (IOException e) { logger.log(Level.WARNING, "Could not perform lookup in {0} for {1}:{2}", new Object[] { suggesterDir, field, prefix }); } finally { lock.readLock().unlock(); } return Collections.emptyList(); } /** * Removes all stored data structures. */ public void remove() { lock.writeLock().lock(); try { try { close(); } catch (IOException e) { logger.log(Level.WARNING, "Could not close opened index directory {0}", indexDir); } try { FileUtils.deleteDirectory(suggesterDir.toFile()); } catch (IOException e) { logger.log(Level.WARNING, "Cannot remove suggester data: {0}", suggesterDir); } } finally { lock.writeLock().unlock(); } } /** * Increments search count for {@code term} by 1. * @param term term for which to increment search count */ public void incrementSearchCount(final Term term) { incrementSearchCount(term, 1); } /** * Increments search count for {@code term} by {@code value}. * @param term term for which to increment search count * @param value value to increment by */ public void incrementSearchCount(final Term term, final int value) { if (term == null) { throw new IllegalArgumentException("Cannot increment search count for null"); } boolean gotLock = lock.readLock().tryLock(); if (!gotLock) { // do not wait for rebuild return; } try { if (lookups.get(term.field()).get(term.text()) == null) { return; // unknown term } PopularityMap map = searchCountMaps.get(term.field()); if (map != null) { map.increment(term.bytes(), value); } } finally { lock.readLock().unlock(); } } /** * Returns search counts for term field. For the time the returned data structure is used this object needs to be * locked by {@link #tryLock()}. * @param field term field * @return search counts object */ public PopularityCounter getSearchCounts(final String field) { if (!searchCountMaps.containsKey(field)) { return key -> 0; } return key -> searchCountMaps.get(field).get(key); } /** * Closes the open data structures. * @throws IOException if the index directory could not be closed */ @Override public void close() throws IOException { lock.writeLock().lock(); try { searchCountMaps.values().forEach(val -> { try { val.close(); } catch (Exception e) { logger.log(Level.WARNING, "Could not properly close most popular completion data", e); } }); indexDir.close(); tempDir.close(); } finally { lock.writeLock().unlock(); } } private long getDataVersion() { File versionFile = getFile(VERSION_FILE_NAME); if (!versionFile.exists()) { return -1; } try { String str = FileUtils.readFileToString(versionFile, StandardCharsets.UTF_8); return Long.parseLong(str); } catch (IOException e) { logger.log(Level.WARNING, "Could not read suggester data version", e); } return -1; } private void storeDataVersion(final long version) { try { FileUtils.writeStringToFile(getFile(VERSION_FILE_NAME), "" + version, StandardCharsets.UTF_8); } catch (IOException e) { logger.log(Level.WARNING, "Could not store version", e); } } /** * Tries to lock the inner data structures for reading, so far only for {@link #getSearchCounts(String)}. * @return {@code true} if lock was acquired, {@code false} otherwise */ public boolean tryLock() { return lock.readLock().tryLock(); } /** * Unlocks the inner data structures for reading. */ public void unlock() { lock.readLock().unlock(); } /** * Returns the searched terms sorted according to their popularity. * @param field field for which to return the data * @param page which page of data to retrieve * @param pageSize number of results to return * @return list of terms with their popularity */ public List<Entry<BytesRef, Integer>> getSearchCountsSorted(final String field, int page, int pageSize) { lock.readLock().lock(); try { PopularityMap map = searchCountMaps.get(field); if (map == null) { logger.log(Level.FINE, "No search count map initialized for field {0}", field); return Collections.emptyList(); } return map.getPopularityData(page, pageSize); } finally { lock.readLock().unlock(); } } @Override public String toString() { return "SuggesterProjectData{" + "indexDir=" + indexDir + ", suggesterDir=" + suggesterDir + ", allowMostPopular=" + allowMostPopular + '}'; } /** * An {@link InputIterator} for WFST data structure with most popular completion support. */ private static class WFSTInputIterator implements InputIterator { private final InputIterator wrapped; private final IndexReader indexReader; private final String field; private long termLengthAccumulator = 0; private final PopularityCounter searchCounts; WFSTInputIterator(final InputIterator wrapped, final IndexReader indexReader, final String field, final PopularityCounter searchCounts) { this.wrapped = wrapped; this.indexReader = indexReader; this.field = field; this.searchCounts = searchCounts; } private BytesRef last; @Override public long weight() { if (last != null) { int add = searchCounts.get(last); return SuggesterUtils.computeScore(indexReader, field, last) + add * SuggesterSearcher.TERM_ALREADY_SEARCHED_MULTIPLIER; } return DEFAULT_WEIGHT; } @Override public BytesRef payload() { return wrapped.payload(); } @Override public boolean hasPayloads() { return wrapped.hasPayloads(); } @Override public Set<BytesRef> contexts() { return wrapped.contexts(); } @Override public boolean hasContexts() { return wrapped.hasContexts(); } @Override public BytesRef next() throws IOException { last = wrapped.next(); // skip very large terms because of the buffer exception while (last != null && last.length > MAX_TERM_SIZE) { last = wrapped.next(); } if (last != null) { termLengthAccumulator += last.length; } return last; } } }