Java tutorial
/** * Copyright (c) 2011 eXtensible Catalog Organization * * This program is free software; you can redistribute it and/or modify it under the terms of the MIT/X11 license. The text of the * license can be found at http://www.opensource.org/licenses/mit-license.php and copy of the license can be found on the project * website http://www.extensiblecatalog.org/. */ package xc.mst.services.marcaggregation.matcher; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import xc.mst.bo.record.InputRecord; import xc.mst.bo.record.SaxMarcXmlRecord; import xc.mst.bo.record.marc.Field; import xc.mst.services.marcaggregation.MarcAggregationService; import xc.mst.services.marcaggregation.dao.MarcAggregationServiceDAO; /** * The System control number corresponds to the * <a href="http://www.loc.gov/marc/bibliographic/bd035.html">MARC 035 field</a> * * OCLC Number, i.e. 035a, when the prefix= (OCoLC) (The service must match on both the numeric identifier AND the prefix. * * Note, the original requirement was to only accept this as a matchpoint if (OCoLC) was the prefix. I never implemented it * that way, and now it turns out that the requirement has changed. Now, just make sure you have a valid 035a with a well-formed * prefix followed by an identifier, and save it as a matchpoint. * * A later requirement may be to modify to accept the matchpoint with NO prefix. Not yet implemented! * This requirement bounces back and forth, since: * 3/9/12 This just in, ignore the field if there is no prefix. * * Note that the XC MARC Normalization Service has steps to ensure that these identifiers are in a consistent format. * The prefix is defined as the characters within the parentheses. * OCLC numbers may also contain other letters BETWEEN the prefix and the prefix and the number itself. * These should be ignored in matching, as all OCLC numeric values are unique without the numbers. * E.g. (OCoLC)ocm12345 should match with (OCoLC)12345 but NOT with (NRU)12345. * * We save the entire string in the db, i.e. (OCoLC)ocm12345. * * * It shall be considered an error to have > 1 035$a with prefix (OCoLC), must test for this, and log it. * * 035$a * * @author Benjamin D. Anderson * @author John Brand * */ public class SystemControlNumberMatcher extends FieldMatcherService { // you can have multiple 035$a fields within a record (mult 035, each w/1 $a) // thus use a list of SCNData, the data values are the original string // and the normalized string parts - the numeric id and the prefix protected Map<Long, List<SCNData>> inputId2scn = new HashMap<Long, List<SCNData>>(); protected Map<Long, List<SCNData>> inputId2scn_unpersisted = new HashMap<Long, List<SCNData>>(); // multiple records might have the same normalized 035$a, this would be an indication of a match protected Map<SCNData, List<Long>> scn2inputIds = new HashMap<SCNData, List<Long>>(); protected Map<Integer, String> id2prefix = new HashMap<Integer, String>(); protected Map<Integer, String> id2prefix_unpersisted = new HashMap<Integer, String>(); protected Map<String, Integer> prefix2id = new HashMap<String, Integer>(); private static final Logger LOG = Logger.getLogger(SystemControlNumberMatcher.class); private boolean keepAllCached = false; //true; MarcAggregationService mas = null; // as a side effect populates prefix list, for now, only if it finds a non-blank prefix. protected String getPrefix(String s) { int start, end; if (s.contains("(")) { start = s.indexOf("("); if (s.contains(")")) { end = s.indexOf(")"); LOG.debug(s); final String prefix = (s.substring(start + 1, end)).toUpperCase(); // case-insensitive matching MST-538 Character first = prefix.charAt(0); // probably really need to be a 3 alpha prefix but for now make sure it starts with alpha. if (prefix != null && prefix.length() > 0) { if (Character.isLetter(first)) { LOG.debug("found a prefix of " + prefix); // threads need to lock - critical section synchronized (this) { if (!prefix2id.containsKey(prefix)) { int newId = prefix2id.size(); prefix2id.put(prefix, newId); id2prefix.put(newId, prefix); if (MarcAggregationService.hasIntermediatePersistence) { id2prefix_unpersisted.put(newId, prefix); } } } return prefix; } } } } return ""; } protected SCNData getMapId(String s) throws Exception { // return (getNumericId(s)*1000)+getPrefixId(s); final String prefix = getPrefix(s); String numericId = ""; int inx = s.toUpperCase().indexOf(prefix); // case-insensitive matching MST-538 if (inx >= 0) { numericId = s.substring(inx + prefix.length() + 1); } else { //LOG.error("** Problem with numeric ID in SCNData, prefix=" + prefix + " , original=" + s); throw new Exception("Bad SCN Data"); } if (!isSCNValid(numericId)) { //LOG.error("** Problem with numeric ID in SCNData, prefix=" + prefix + " , numeric ID=" + numericId); throw new Exception("Bad SCN Data"); } LOG.debug("mapID:" + prefix + numericId); return new SCNData(prefix, prefix2id.get(prefix), numericId, s); } @Override public List<Long> getMatchingInputIds(SaxMarcXmlRecord ir) { MarcAggregationServiceDAO masDao = getMAS().getMarcAggregationServiceDAO(); ArrayList<Long> results = new ArrayList<Long>(); List<Field> fields = ir.getDataFields(35); final Long id = new Long(ir.recordId); for (Field field : fields) { List<String> subfields = SaxMarcXmlRecord.getSubfieldOfField(field, 'a'); /* final int size = subfields.size(); if (size > 1) { LOG.error("ERROR: Multiple $a subfields in 035 in record! " + ir.recordId); }*/ for (String subfield : subfields) { SCNData goods = null; try { goods = getMapId(subfield); } catch (Exception e) { continue; } // for now don't consider 035$a if no prefix. if (goods.prefix.equals("")) continue; // look in memory first if (scn2inputIds.get(goods) != null) { List<Long> m = scn2inputIds.get(goods); if (m != null && m.size() > 0) { results.addAll(m); } if (results.contains(id)) { results.remove(id); } } // also, look in the database List<Long> records = masDao.getMatchingSCCNRecords(MarcAggregationServiceDAO.matchpoints_035a_table, MarcAggregationServiceDAO.input_record_id_field, MarcAggregationServiceDAO.numeric_id_field, MarcAggregationServiceDAO.prefix_id_field, goods); LOG.debug("SCN, DAO, getMatching records for " + goods + ", numResults=" + records.size()); for (Long record : records) { if (!record.equals(id)) { if (!results.contains(record)) { results.add(record); LOG.debug("**SCN, DAO, record id: " + record + " matches id " + id); } } } } } LOG.debug("getMatchingInputIds, irId=" + ir.recordId + " results.size=" + results.size()); return results; } /** * when a record is updated/deleted, need to use this to */ @Override public void removeRecordFromMatcher(InputRecord r) { Long id = new Long(r.getId()); List<SCNData> goodsList = inputId2scn.get(id); if (goodsList != null) { for (SCNData goodsFields : goodsList) { List<Long> idsList = scn2inputIds.get(goodsFields); if (idsList != null) { idsList.remove(id); if (idsList.size() > 0) { scn2inputIds.put(goodsFields, idsList); } else { scn2inputIds.remove(goodsFields); } } } } inputId2scn.remove(id); if (MarcAggregationService.hasIntermediatePersistence) { inputId2scn_unpersisted.remove(id); } // keep database in sync. Don't worry about the one-off performance hit...yet. MarcAggregationService s = getMAS(); s.getMarcAggregationServiceDAO().deleteMergeRow(MarcAggregationServiceDAO.matchpoints_035a_table, id); } // should be a max of 1 field returned. // * It shall be considered an error to have > 1 035$a with prefix (OCoLC), must test for this, and log it. @Override public void addRecordToMatcher(SaxMarcXmlRecord r, InputRecord ir) { List<Field> fields = r.getDataFields(35); for (Field field : fields) { List<String> subfields = SaxMarcXmlRecord.getSubfieldOfField(field, 'a'); for (String subfield : subfields) { Long id = new Long(r.recordId); String prefix = getPrefix(subfield); if (prefix.equals("")) { // must have a prefix to use as a match point. // TODO MST-503 continue; } SCNData goods = null; try { goods = getMapId(subfield); } catch (Exception e) { continue; } List<SCNData> goodsList = inputId2scn.get(id); if (goodsList == null || goodsList.size() == 0) { goodsList = new ArrayList<SCNData>(); goodsList.add(goods); inputId2scn.put(id, goodsList); if (MarcAggregationService.hasIntermediatePersistence) { inputId2scn_unpersisted.put(id, goodsList); } } else if (!goodsList.contains(goods)) { goodsList.add(goods); inputId2scn.put(id, goodsList); if (MarcAggregationService.hasIntermediatePersistence) { inputId2scn_unpersisted.put(id, goodsList); } } else { LOG.debug("we have already seen " + goods + " for recordId: " + r.recordId); } List<Long> idsList = scn2inputIds.get(goods); if (idsList == null || idsList.size() == 0) { idsList = new ArrayList<Long>(); idsList.add(id); scn2inputIds.put(goods, idsList); } else if (!idsList.contains(id)) { idsList.add(id); scn2inputIds.put(goods, idsList); } else { // error? LOG.debug("we have already seen " + id + " for recordId: " + r.recordId); } } } } /** * test to see if String is valid (not empty) * @param numeric string, allegedly * @return the trueness of it all */ private boolean isSCNValid(String numeric) { return !StringUtils.isBlank(numeric); } public boolean matchpointsHaveChanged(SaxMarcXmlRecord r, InputRecord ir) { LOG.debug("Sccn matchpointsHaveChanged? ID: " + ir.getId()); Map<Long, List<SCNData>> cachedListId2scn = getMAS().getMarcAggregationServiceDAO() .getSCCNRecordsCache(Long.valueOf(ir.getId())); LOG.debug("cachedListId2scn: " + cachedListId2scn); List<SCNData> cachedId2scn = new ArrayList<SCNData>(); if (cachedListId2scn.containsKey(ir.getId())) { cachedId2scn = cachedListId2scn.get(ir.getId()); LOG.debug("cachedId2scn: " + cachedId2scn); } List<SCNData> thisId2scn = new ArrayList<SCNData>(); List<Field> fields = r.getDataFields(35); for (Field field : fields) { List<String> subfields = SaxMarcXmlRecord.getSubfieldOfField(field, 'a'); for (String subfield : subfields) { String prefix = getPrefix(subfield); if (prefix.equals("")) { // must have a prefix to use as a match point. // TODO MST-503 continue; } SCNData goods = null; try { goods = getMapId(subfield); } catch (Exception e) { continue; } LOG.debug("adding thisId2scn: " + goods); thisId2scn.add(goods); } } LOG.error("gonna compare cachedId2scn: " + cachedId2scn + " ...with... thisId2scn: " + thisId2scn); Set<SCNData> setA = new HashSet<SCNData>(cachedId2scn); Set<SCNData> setB = new HashSet<SCNData>(thisId2scn); boolean same = setA.containsAll(thisId2scn) && setB.containsAll(cachedId2scn); return (!same); } private MarcAggregationService getMAS() { if (mas == null) { mas = (MarcAggregationService) config.getBean("MarcAggregationService"); } return mas; } // from db @Override public void load(boolean firstTime) { // we will only keep all objects in-memory for the initial (large) load; otherwise, we need to consult database too keepAllCached = firstTime; // we NEED to always load prefixes. (It's a small list, anyway, but very, very NECESSARY). MarcAggregationService s = getMAS(); id2prefix = s.getMarcAggregationServiceDAO().getPrefixes(); for (Integer id : id2prefix.keySet()) { prefix2id.put(id2prefix.get(id), id); } if (!keepAllCached) return; // Retrieve all match point integer data into memory, inputId2scn = s.getMarcAggregationServiceDAO().getSCCNRecordsCache(); LOG.info("inputId2scn loaded, size=" + inputId2scn.size()); // now go from inputId2scn to populate scn2inputIds for (Long id : inputId2scn.keySet()) { List<SCNData> ids = inputId2scn.get(id); for (SCNData goods : ids) { List<Long> idsList = scn2inputIds.get(goods); if (idsList == null || idsList.size() == 0) { idsList = new ArrayList<Long>(); idsList.add(id); scn2inputIds.put(goods, idsList); } else if (!idsList.contains(id)) { idsList.add(id); scn2inputIds.put(goods, idsList); } } } } // into db @Override public void flush(boolean force) { if (force) { MarcAggregationService s = getMAS(); if (MarcAggregationService.hasIntermediatePersistence) { s.getMarcAggregationServiceDAO().persistPrefixList(id2prefix_unpersisted, MarcAggregationServiceDAO.prefixes_035a_table); s.getMarcAggregationServiceDAO().persistSCNMatchpointMaps(inputId2scn_unpersisted, MarcAggregationServiceDAO.matchpoints_035a_table); id2prefix_unpersisted.clear(); inputId2scn_unpersisted.clear(); } else { s.getMarcAggregationServiceDAO().persistPrefixList(id2prefix, MarcAggregationServiceDAO.prefixes_035a_table); s.getMarcAggregationServiceDAO().persistSCNMatchpointMaps(inputId2scn/* _unpersisted */, MarcAggregationServiceDAO.matchpoints_035a_table); } // we persisted everything already; no need to keep in-memory objects too (but do keep id2prefix in memory!) if (!keepAllCached) { inputId2scn.clear(); scn2inputIds.clear(); } } } /** * For testing. (for my tests, the more reliable number was out of the db) */ public int getNumRecordIdsInMatcher() { //return inputId2scn.size(); MarcAggregationService s = getMAS(); LOG.debug("** 035 matcher contains " + s.getMarcAggregationServiceDAO() .getNumUniqueRecordIds(MarcAggregationServiceDAO.matchpoints_035a_table) + " unique records in dB & " + inputId2scn.size() + " records in mem."); return s.getMarcAggregationServiceDAO() .getNumUniqueRecordIds(MarcAggregationServiceDAO.matchpoints_035a_table); } //TODO check in dB too? // - seems to be unused, and I already don't recall what I used if for or thought I wanted it for. public Collection<Long> getRecordIdsInMatcher() { return inputId2scn.keySet(); } /** * For testing. */ public int getNumMatchPointsInMatcher() { //return scn2inputIds.size(); MarcAggregationService s = getMAS(); LOG.debug("** 035 matcher contains " + s.getMarcAggregationServiceDAO() .getNumUniqueNumericIds(MarcAggregationServiceDAO.matchpoints_035a_table) + " unique strings in dB & " + inputId2scn.size() + " strs in mem."); return s.getMarcAggregationServiceDAO() .getNumUniqueNumericIds(MarcAggregationServiceDAO.matchpoints_035a_table); } }