Java tutorial
// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.semantic.recognizer;

import java.io.IOException;
import java.util.*;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.talend.dataquality.semantic.api.CategoryRegistryManager;
import org.talend.dataquality.semantic.classifier.custom.UserDefinedClassifier;
import org.talend.dataquality.semantic.classifier.impl.DataDictFieldClassifier;
import org.talend.dataquality.semantic.index.Index;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.model.MainCategory;

/**
 * Default {@link CategoryRecognizer}: classifies each input value with a dictionary/keyword classifier and an
 * optional user-defined classifier, and accumulates per-category occurrence counts that {@link #getResult()}
 * turns into frequencies.
 * <p>
 * The instance keeps mutable counters and a cache without any synchronization.
 */
class DefaultCategoryRecognizer implements CategoryRecognizer {

    /** Every category seen so far; this same list instance is sorted and returned by {@link #getResult()}. */
    private final List<CategoryFrequency> catList = new ArrayList<>();

    /** Lookup of the entries of {@link #catList} by the key passed to {@code incrementCategory}. */
    private final Map<String, CategoryFrequency> categoryToFrequency = new HashMap<>();

    private final DataDictFieldClassifier dataDictFieldClassifier;

    /** May be {@code null}; in that case only the dictionary classifier is consulted. */
    private final UserDefinedClassifier userDefineClassifier;

    /** Cache from an input value to the category ids already computed for it (constructor arguments are LFUCache tuning values; see LFUCache). */
    private final LFUCache<String, Set<String>> knownCategoryCache = new LFUCache<String, Set<String>>(10, 1000, 0.01f);

    /** Number of inputs counted as empty by {@link #getSubCategorySet(String)}. */
    private long emptyCount = 0;

    /** Number of calls to {@link #process(String)} since construction or the last {@link #reset()}. */
    private long total = 0;

    private final CategoryRegistryManager crm;

    /**
     * Creates a recognizer that uses the regex classifier obtained from
     * {@code CategoryRegistryManager.getInstance().getRegexClassifier(true)}.
     *
     * @param dictionary index handed to the {@link DataDictFieldClassifier}
     * @param keyword index handed to the {@link DataDictFieldClassifier}
     * @throws IOException declared by the constructors this one delegates to
     */
    public DefaultCategoryRecognizer(Index dictionary, Index keyword) throws IOException {
        this(dictionary, keyword, CategoryRegistryManager.getInstance().getRegexClassifier(true));
    }

    /**
     * Creates a recognizer with an explicit user-defined classifier.
     *
     * @param dictionary index handed to the {@link DataDictFieldClassifier}
     * @param keyword index handed to the {@link DataDictFieldClassifier}
     * @param regex user-defined classifier; may be {@code null}, in which case it is skipped during classification
     * @throws IOException declared for callers; NOTE(review): presumably raised while opening the indexes - confirm
     */
    public DefaultCategoryRecognizer(Index dictionary, Index keyword, UserDefinedClassifier regex) throws IOException {
        dataDictFieldClassifier = new DataDictFieldClassifier(dictionary, keyword);
        this.userDefineClassifier = regex;
        crm = CategoryRegistryManager.getInstance();
    }

    @Override
    public DataDictFieldClassifier getDataDictFieldClassifier() {
        return dataDictFieldClassifier;
    }

    @Override
    public UserDefinedClassifier getUserDefineClassifier() {
        return userDefineClassifier;
    }

    @Override
    public CategoryRegistryManager getCrm() {
        return crm;
    }

    /**
     * Classifies one value.
     * <p>
     * A {@code null} or whitespace-only value increments the empty counter and yields a new empty set. Otherwise
     * a cached result is returned if present. Non-cached values are dispatched on their {@link MainCategory}:
     * Alpha/AlphaNumeric values go through both classifiers, Numeric values only through the user-defined
     * classifier, and both kinds of result are cached.
     *
     * @param data the input value
     * @return the set of its semantic categories; for a cache hit this is the cached set instance itself
     */
    public Set<String> getSubCategorySet(String data) {
        if (data == null || StringUtils.EMPTY.equals(data.trim())) {
            emptyCount++;
            return new HashSet<>();
        }

        final Set<String> knownCategory = knownCategoryCache.get(data);
        if (knownCategory != null) {
            return knownCategory;
        }

        MainCategory mainCategory = MainCategory.getMainCategory(data);
        Set<String> subCategorySet = new HashSet<>();
        switch (mainCategory) {
        case Alpha:
        case AlphaNumeric:
            subCategorySet.addAll(dataDictFieldClassifier.classify(data));
            if (userDefineClassifier != null) {
                subCategorySet.addAll(userDefineClassifier.classify(data, mainCategory));
            }
            knownCategoryCache.put(data, subCategorySet);
            break;
        case Numeric:
            if (userDefineClassifier != null) {
                subCategorySet.addAll(userDefineClassifier.classify(data, mainCategory));
            }
            knownCategoryCache.put(data, subCategorySet);
            break;
        case NULL:
        case BLANK:
            emptyCount++;
            break;
        case UNKNOWN:
            break;
        }
        return subCategorySet;
    }

    /** Intentionally a no-op in this implementation. */
    @Override
    public void prepare() {
        // dictionary.initIndex();
        // keyword.initIndex();
    }

    /** Clears the accumulated categories, both counters and the value cache. */
    @Override
    public void reset() {
        catList.clear();
        categoryToFrequency.clear();
        total = 0;
        emptyCount = 0;
        knownCategoryCache.clear();
    }

    /**
     * Classifies {@code data}, updates the per-category counters and increments the processed-row total.
     * <p>
     * Each matched id is resolved through the registry; ids without metadata are ignored. For resolved
     * categories the category itself and, when it has parents, all its ancestors are counted. When no id
     * matched at all, the EMPTY category (empty string) is counted instead.
     *
     * @param data the input value
     * @return the names of the matched categories and of their ancestors, possibly empty
     * @see org.talend.dataquality.semantic.recognizer.CategoryRecognizer#process(java.lang.String)
     */
    @Override
    public String[] process(String data) {
        Set<String> ids = getSubCategorySet(data);
        Set<String> categories = new HashSet<>();
        if (!ids.isEmpty()) {
            for (String id : ids) {
                DQCategory meta = crm.getCategoryMetadataById(id);
                if (meta != null) {
                    incrementCategory(meta.getName(), meta.getLabel());
                    if (!CollectionUtils.isEmpty(meta.getParents())) {
                        incrementAncestorsCategories(categories, id);
                    }
                    categories.add(meta.getName());
                }
            }
        } else {
            incrementCategory(StringUtils.EMPTY);
        }
        total++;
        return categories.toArray(new String[categories.size()]);
    }

    /**
     * For the discovery, if a category c matches with the data,
     * it means all the ancestor categories of c have to match too.
     * This method increments the ancestor categories of c.
     * <p>
     * Ancestors are visited in FIFO order starting from c; each parent id is handled at most once per call, and
     * the level passed along is the number of parent links followed from c.
     *
     * @param categories the category result, receives the name of every counted ancestor
     * @param id the category ID of the matched category c
     */
    private void incrementAncestorsCategories(Set<String> categories, String id) {
        Deque<Pair<String, Integer>> catToSee = new ArrayDeque<>();
        Set<String> catAlreadySeen = new HashSet<>();
        catToSee.add(Pair.of(id, 0));
        Pair<String, Integer> currentCategory;
        while (!catToSee.isEmpty()) {
            currentCategory = catToSee.pop();
            DQCategory dqCategory = crm.getCategoryMetadataById(currentCategory.getLeft());
            if (dqCategory != null && !CollectionUtils.isEmpty(dqCategory.getParents())) {
                int parentLevel = currentCategory.getRight() + 1;
                for (DQCategory parent : dqCategory.getParents()) {
                    if (!catAlreadySeen.contains(parent.getId())) {
                        catAlreadySeen.add(parent.getId());
                        catToSee.add(Pair.of(parent.getId(), parentLevel));
                        DQCategory meta = crm.getCategoryMetadataById(parent.getId());
                        if (meta != null) {
                            incrementCategory(meta.getName(), meta.getLabel(), parentLevel);
                            categories.add(meta.getName());
                        }
                    }
                }
            }
        }
    }

    /** Counts one occurrence of a category whose key and display name are the same string. */
    private void incrementCategory(String catId) {
        incrementCategory(catId, catId);
    }

    /** Counts one occurrence of a category at level 0. */
    private void incrementCategory(String catId, String catName) {
        incrementCategory(catId, catName, 0);
    }

    /**
     * Counts one occurrence of the category keyed by {@code catId}, registering it on first sight.
     * {@code catName} and {@code categoryLevel} are only used when the entry is created.
     */
    private void incrementCategory(String catId, String catName, int categoryLevel) {
        CategoryFrequency c = categoryToFrequency.get(catId);
        if (c == null) {
            c = new CategoryFrequency(catId, catName, categoryLevel);
            categoryToFrequency.put(catId, c);
            catList.add(c);
        }
        c.count++;
    }

    /**
     * Computes the frequency of every category and sorts the categories by descending count, with the EMPTY
     * category (empty name) always last.
     *
     * @return the internal category list, sorted in place (not a copy)
     */
    @Override
    public Collection<CategoryFrequency> getResult() {
        for (CategoryFrequency category : categoryToFrequency.values()) {
            // NOTE(review): if count is an integral type, the division truncates before Math.round is applied,
            // so the percentage is cut (not rounded) to two decimals - confirm whether rounding was intended.
            category.frequency = Math.round(category.count * 10000 / total) / 100F;
        }

        Collections.sort(catList, new Comparator<CategoryFrequency>() {

            @Override
            public int compare(CategoryFrequency o1, CategoryFrequency o2) {
                // The EMPTY category must always be ranked after the others
                final boolean firstIsEmpty = "".equals(o1.categoryName);
                final boolean secondIsEmpty = "".equals(o2.categoryName);
                if (firstIsEmpty && secondIsEmpty) {
                    // keep the comparator symmetric when both sides are EMPTY
                    return 0;
                }
                if (firstIsEmpty) {
                    return 1;
                }
                if (secondIsEmpty) {
                    return -1;
                }
                // Long.compare instead of a narrowed subtraction: casting the difference to int can flip its sign
                return Long.compare(o2.count, o1.count);
            }
        });
        return catList;
    }

    /** Closes the index of the dictionary classifier and clears the value cache. */
    @Override
    public void end() {
        dataDictFieldClassifier.closeIndex();
        knownCategoryCache.clear();
    }
}