Java tutorial: the LuceneUtils helper class from Kew's Reconciliation and Matching Framework
/*
 * Reconciliation and Matching Framework
 * Copyright 2014 Royal Botanic Gardens, Kew
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.kew.rmf.core.lucene;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexableField;
import org.kew.rmf.core.configuration.Configuration;
import org.kew.rmf.core.configuration.Property;
import org.kew.rmf.matchers.MatchException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A helper class to
 * <ul>
 * <li>map from Maps to Lucene Documents and vice versa</li>
 * <li>build a query string that Lucene understands</li>
 * <li>check whether two strings match according to the configured
 *     {@link org.kew.rmf.matchers.Matcher}</li>
 * </ul>
 */
public class LuceneUtils {
    private static Logger logger = LoggerFactory.getLogger(LuceneUtils.class);

    /** Renders a Document as one "name : value" line per field, with no prefix. */
    public static String doc2String(Document doc) {
        return doc2String(doc, "");
    }

    /** Converts a Lucene Document into a field-name to stored-value map. */
    public static Map<String, String> doc2Map(Document doc) {
        Map<String, String> map = new HashMap<String, String>();
        for (IndexableField f : doc.getFields()) {
            map.put(f.name(), f.stringValue());
        }
        return map;
    }

    /** Converts a map into a Lucene Document of stored TextFields; null values become empty strings. */
    public static Document map2Doc(Map<String, String> map) {
        Document doc = new Document();
        for (String key : map.keySet()) {
            String value = map.get(key);
            value = (value != null) ? value : "";
            doc.add(new TextField(key, value, Field.Store.YES));
        }
        return doc;
    }

    /** Renders a Document as one "name : value" line per field, each prefixed with the given string. */
    public static String doc2String(Document doc, String prefix) {
        StringBuffer sb = new StringBuffer();
        for (IndexableField f : doc.getFields()) {
            sb.append(prefix).append(f.name()).append(" : ")
                    .append(doc.getField(f.name()).stringValue())
                    .append("\n");
        }
        return sb.toString();
    }

    /** Joins the Document's field values into a single line using the given separator. */
    public static String doc2Line(Document doc, String fieldSeparator) {
        StringBuffer sb = new StringBuffer();
        for (IndexableField f : doc.getFields()) {
            if (sb.length() > 0) sb.append(fieldSeparator);
            sb.append(doc.getField(f.name()).stringValue());
        }
        return sb.toString();
    }

    public static String buildQuery(List<Property> properties, Document doc, boolean dedupl) {
        Map<String, String> map = doc2Map(doc);
        return buildQuery(properties, map, dedupl);
    }

    /**
     * Builds a Lucene query string from the properties configured for select / negative select.
     * When deduplicating, the record itself is excluded by its ID.
     */
    public static String buildQuery(List<Property> properties, Map<String, String> map, boolean dedupl) {
        StringBuffer sb = new StringBuffer();
        if (dedupl) {
            // Be sure not to return self:
            sb.append("NOT " + Configuration.ID_FIELD_NAME + ":" + map.get(Configuration.ID_FIELD_NAME));
        }
        for (Property p : properties) {
            if (p.isUseInSelect() || p.isUseInNegativeSelect()) {
                String authorityName = p.getAuthorityColumnName() + Configuration.TRANSFORMED_SUFFIX;
                String value = map.get(p.getQueryColumnName() + Configuration.TRANSFORMED_SUFFIX);
                // super-csv treats blank as null, we don't for now
                value = (value != null) ? value : "";
                String quotedValue = "\"" + value + "\"";
                if (p.isUseInSelect()) {
                    if (StringUtils.isNotBlank(value)) {
                        if (p.getMatcher().isExact()) {
                            if (sb.length() > 0) sb.append(" AND ");
                            sb.append(authorityName + ":" + quotedValue);
                        }
                        if (p.isIndexLength()) {
                            // Restrict to values within two characters of the query value's length.
                            int low = Math.max(0, value.length() - 2);
                            int high = value.length() + 2;
                            if (sb.length() > 0) sb.append(" AND ");
                            sb.append(authorityName).append(Configuration.LENGTH_SUFFIX);
                            sb.append(String.format(":[%02d TO %02d]", low, high));
                        }
                        if (p.isIndexInitial()) {
                            if (sb.length() > 0) sb.append(" AND ");
                            sb.append(authorityName).append(Configuration.INITIAL_SUFFIX).append(':')
                                    .append(quotedValue.substring(0, 2)).append('"');
                        }
                        if (p.isUseWildcard()) {
                            if (sb.length() > 0) sb.append(" AND ");
                            sb.append(authorityName).append(":")
                                    .append(quotedValue.subSequence(0, quotedValue.length() - 1)).append("~0.5\"");
                        }
                    }
                }
                else { // isUseInNegativeSelect
                    if (StringUtils.isNotBlank(value)) {
                        if (sb.length() > 0) {
                            sb.append(" AND");
                        }
                        sb.append(" NOT " + authorityName + ":" + quotedValue);
                    }
                }
            }
        }
        return sb.toString();
    }

    public static boolean recordsMatch(Document from, Document to, List<Property> properties) throws Exception {
        Map<String, String> map = doc2Map(from);
        return recordsMatch(map, to, properties);
    }

    /**
     * Compares a query record against an authority record property by property;
     * every configured property must match for the records to match.
     */
    public static boolean recordsMatch(Map<String, String> queryRecord, Document authorityRecord, List<Property> properties) throws MatchException {
        if (logger.isTraceEnabled()) {
            logger.trace("Comparing records: Q:{} A:{}",
                    queryRecord.get(Configuration.ID_FIELD_NAME),
                    authorityRecord.get(Configuration.ID_FIELD_NAME));
        }
        boolean recordMatch = false;
        for (Property p : properties) {
            String queryName = p.getQueryColumnName() + Configuration.TRANSFORMED_SUFFIX;
            String authorityName = p.getAuthorityColumnName() + Configuration.TRANSFORMED_SUFFIX;

            String query = queryRecord.get(queryName);
            query = (query != null) ? query : "";

            String authority = authorityRecord.get(authorityName);
            authority = (authority != null) ? authority : "";

            boolean fieldMatch = false;
            if (p.isBlanksMatch()) {
                if (StringUtils.isBlank(query)) {
                    fieldMatch = true;
                    logger.trace("Q:'' ? A:'{}' true (blank query)", authority);
                }
                else if (StringUtils.isBlank(authority)) {
                    fieldMatch = true;
                    logger.trace("Q:'{}' ? A:'' true (blank authority)", query);
                }
            }

            if (!fieldMatch) {
                fieldMatch = p.getMatcher().matches(query, authority);
                logger.trace("Q:'{}' ? A:'{}' {}", query, authority, fieldMatch);
            }

            recordMatch = fieldMatch;
            if (!recordMatch) {
                logger.trace("Failed on {}", queryName);
                break;
            }
        }
        return recordMatch;
    }
}