Usage examples for org.apache.commons.codec.language.DoubleMetaphone#encode
public String encode(String value)
From source file:experimentos.LevenshteinExperimentCDQuID.java
public static void main(String[] args) throws Exception { // enables dynamic data-loading for file-based sorting GlobalConfig.getInstance().setInMemoryObjectThreshold(10000); // sets the CSV data source CSVSource dataSource = new CSVSource("cd", new File("cd.csv")); dataSource.enableHeader();//from ww w. j a v a 2 s . co m dataSource.addIdAttributes("pk"); // CSVSource goldstandardSource = new CSVSource("goldstandard", new File("cd_gold.csv")); // goldstandardSource.enableHeader(); // instantiate the gold standard // "cddb" is the source identifier //GoldStandard goldStandard = new GoldStandard(goldstandardSource); //goldStandard.setFirstElementsObjectIdAttributes("disc1_id"); //goldStandard.setSecondElementsObjectIdAttributes("disc2_id"); //goldStandard.setSourceIdLiteral("cddb"); // defines sub-keys that are used to generate the sorting key TextBasedSubkey artistSubkey = new TextBasedSubkey("artist"); artistSubkey.setIgnoredCharactersRegEx(TextBasedSubkey.NO_VOWELS_REGEX); DocumentFrequencyPreprocessor dfPreprocessor = new DocumentFrequencyPreprocessor("artist"); // the key generator uses sub-key selectors to generate a key for each object SortingKey sortingKey = new SortingKey(); sortingKey.addSubkey(artistSubkey); Algorithm algorithm = new SortedNeighborhoodMethod(sortingKey, 30); algorithm.addPreprocessor(dfPreprocessor); // enable in-memory storing algorithm.enableInMemoryProcessing(); // adds the "data" to the algorithm algorithm.addDataSource(dataSource); // instantiates similarity measure //SimilarityFunction similarityFunction = new TFIDFSimilarityFunction(dfPreprocessor, "title"); SimilarityFunction similarityFunction = new LevenshteinDistanceFunction("artist"); // DuDeOutput output = new CSVOutput(new File("saida.csv")); long start = System.currentTimeMillis(); // counts the generated object pairs int cnt = 0; int dupCnt = 0; int nondupCnt = 0; //Map<String, ArrayList<String>> mapaSimilares = new HashMap<String, ArrayList<String>>(); BlockIndex bi = new 
BlockIndex(); DoubleMetaphone db = new DoubleMetaphone(); // StatisticComponent statistic = new StatisticComponent(goldStandard, algorithm); for (DuDeObjectPair pair : algorithm) { if (similarityFunction.getSimilarity(pair) > 0.8) { ++dupCnt; String pk1 = pair.getFirstElement().getAttributeValue("pk").toString(); String pk2 = pair.getSecondElement().getAttributeValue("pk").toString(); String title1 = pair.getFirstElement().getAttributeValue("artist").toString(); String title2 = pair.getSecondElement().getAttributeValue("artist").toString(); String keyBlockpair1 = db.encode(title1); String keyBlockpair2 = db.encode(title2); //int cluster1 = bi.getclusterId(pk1, keyBlockpair1, "cd"); //int cluster2 = bi.getclusterId(pk2, keyBlockpair2, "cd"); Vertice v1 = new Vertice(pk1, "cd", 0); Vertice v2 = new Vertice(pk2, "cd", 0); bi.insertVertice(keyBlockpair1, v1); bi.insertVertice(keyBlockpair2, v2); // statistic.addDuplicate(pair); /**if ((cluster1 != -1) && (cluster2 == -1)) { Vertice v2 = new Vertice(pk2, "cd", cluster1); bi.insertVertice(keyBlockpair2, v2); caso1++; }else if ((cluster1 == -1) && (cluster2 != -1)) { Vertice v1 = new Vertice(pk1, "cd", cluster2); bi.insertVertice(keyBlockpair1, v1); caso2++; } else if ((cluster1 == -1) && (cluster2 == -1)) { Vertice v1 = new Vertice(pk1, "cd", cluster); Vertice v2 = new Vertice(pk2, "cd", cluster); bi.insertVertice(keyBlockpair1, v1); bi.insertVertice(keyBlockpair2, v2); cluster++; caso3++; }**/ // System.err.println( pair.getFirstElement().getAttributeValue("title").toString()); } else { ++nondupCnt; // statistic.addNonDuplicate(pair); } ++cnt; } //bi.printBlockIndex(); // System.err.println(" numero total de elementor " + bi.getNumeroElementos()); //StatisticOutput statisticOutput = new SimpleStatisticOutput(System.out, statistic); // statisticOutput.writeStatistics(); algorithm.cleanUp(); // print statistics // System.out.println(); // System.out.println(); System.err.println( dupCnt + " duplicates out of " + cnt + " 
pairs detected in " + (System.currentTimeMillis() - start) + " ms " + bi.getNumeroElementos() + " " + bi.getNumeroBlocos()); //System.err.println(" casos " + caso1 + " " + caso2 + " " + caso3 + " "); QueryExperimento query = new QueryExperimento(bi); query.query(); }
From source file:it.univpm.deit.semedia.musicuri.core.Toolset.java
/**
 * Generates the Double Metaphone code of every keyword in the given list, using the Apache
 * Commons Codec implementation.
 *
 * @param keywords list of keywords; every element is cast to {@code String}
 * @return a new list holding, in the same order, the encoded form of each keyword
 */
public static ArrayList GenerateMetaphones(ArrayList keywords) {
    ArrayList metaphoneList = new ArrayList(keywords.size());
    DoubleMetaphone encoder = new DoubleMetaphone();
    for (Object keyword : keywords) {
        metaphoneList.add(encoder.encode((String) keyword));
    }
    return metaphoneList;
}
From source file:com.vangent.hieos.empi.transform.DoubleMetaphoneTransformFunction.java
/** * /*from w w w . jav a2s . c om*/ * @param obj * @return */ public Object transform(Object obj) { DoubleMetaphone encoder = new DoubleMetaphone(); return encoder.encode((String) obj); }
From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java
/** * Computes attribute proposals where the class/definition name must match exactly, but where * parameters are processed with fuzzy logic. * //from w ww . j a v a 2s. co m * @param currentName * @param descs * @param searchPath * TODO * @param types * @return */ public String[] computeAttributeProposals(final QualifiedName currentName, Collection<IEObjectDescription> descs, PPSearchPath searchPath) { if (currentName.getSegmentCount() < 2) return new String[0]; final DoubleMetaphone encoder = new DoubleMetaphone(); final String metaphoneName = encoder.encode(currentName.getLastSegment()); Collection<String> proposals = generateAttributeCandidates(currentName, descs, searchPath); // propose all, but sort them based on likeness String[] result = new String[proposals.size()]; proposals.toArray(result); Arrays.sort(result, new PronunciationComparator(encoder, metaphoneName)); return result; }
From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java
/** * Attempts to produce a list of names that are close to the given name. At most 5 proposals * are generated. The returned proposals are made in order of "pronunciation distance" which is * obtained by taking the Levenshtein distance between the Double Monophone encodings of * candidate and given name. Candidates are selected as the names with shortest Levenshtein distance * and names that are Monophonically equal, or starts or ends monophonically. * /*ww w.ja va2 s . co m*/ * @param currentName * the name for which proposals are to be generated * @param descs * the descriptors of available named values * @param searchPath * TODO * @param types * if stated, the wanted types of named values * @return * array of proposals, possibly empty, but never null. */ public String[] computeProposals(final String currentName, Collection<IEObjectDescription> descs, boolean upperCaseProposals, PPSearchPath searchPath, EClass... types) { if (currentName == null || currentName.length() < 1) return new String[0]; // compute the 5 best matches and only accept if score <= 5 ScoreKeeper<IEObjectDescription> tracker = new ScoreKeeper<IEObjectDescription>(5, false, 5); // List<IEObjectDescription> metaphoneAlike = Lists.newArrayList(); final DoubleMetaphone encoder = new DoubleMetaphone(); final String metaphoneName = encoder.encode(currentName); for (IEObjectDescription d : descs) { EClass c = d.getEClass(); typeok: if (types != null && types.length > 0) { for (EClass wanted : types) if ((wanted == c || wanted.isSuperTypeOf(c))) break typeok; continue; } // filter based on path visibility if (searchPath.searchIndexOf(d) == -1) continue; // not visible according to path String candidateName = converter.toString(d.getName()); tracker.addScore(StringUtils.getLevenshteinDistance(currentName, candidateName), d); String candidateMetaphone = encoder.encode(candidateName); // metaphone matches are scored on the pronounciation distance if (metaphoneName.equals(candidateMetaphone) // || 
candidateMetaphone.startsWith(metaphoneName) // || candidateMetaphone.endsWith(metaphoneName) // ) tracker.addScore(StringUtils.getLevenshteinDistance(metaphoneName, candidateMetaphone), d); // System.err.printf("Metaphone alike: %s == %s\n", currentName, candidateName); } List<String> result = Lists.newArrayList(); // System.err.print("Scores = "); for (ScoreEntry<IEObjectDescription> entry : tracker.getScoreEntries()) { String s = converter.toString(entry.getData().getName()); result.add(s); // System.err.printf("%d %s, ", entry.getScore(), s); } // System.err.println(); String[] proposals = result.toArray(new String[result.size()]); PronunciationComparator x = new PronunciationComparator(encoder, metaphoneName); Arrays.sort(proposals, x); // System.err.print("Order = "); // for(int i = 0; i < proposals.length; i++) // System.err.printf("%s, ", proposals[i]); // System.err.println(); return upperCaseProposals ? toUpperCaseProposals(proposals) : proposals; }
From source file:org.openregistry.core.domain.AbstractNameImpl.java
/**
 * Produces the phonetic code used for comparing names.
 *
 * <p>Despite the method name, the code is computed with {@code DoubleMetaphone}.
 *
 * @param comparison the text to encode
 * @return the Double Metaphone encoding of {@code comparison}
 */
protected final String generateSoundEx(final String comparison) {
    return new DoubleMetaphone().encode(comparison);
}
From source file:org.vivoweb.harvester.score.algorithm.NormalizedDoubleMetaphoneDifference.java
/**
 * Scores two items by applying {@code NormalizedLevenshteinDifference} to their Double
 * Metaphone encodings.
 *
 * @param itemX the first item
 * @param itemY the second item
 * @return {@code 0f} if either item is empty, otherwise the normalized Levenshtein result for
 *         the two encodings
 */
@Override
public float calculate(CharSequence itemX, CharSequence itemY) {
    final boolean bothNonEmpty = itemX.length() != 0 && itemY.length() != 0;
    if (!bothNonEmpty) {
        return 0f;
    }

    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String codeX = encoder.encode(itemX.toString());
    final String codeY = encoder.encode(itemY.toString());

    final NormalizedLevenshteinDifference levenshtein = new NormalizedLevenshteinDifference();
    return levenshtein.calculate(codeX, codeY);
}
From source file:query.Amostra.java
public BlockIndex blocaDadosDaAmostraConsulta(CSVSource dataSource) { BlockIndex bi2 = new BlockIndex(); //SoundEx db = new SoundEx(); DoubleMetaphone db = new DoubleMetaphone(); for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) { DuDeObject next = iterator.next(); if (encontraKeySelecionada(next.getAttributeValue("key").toString())) { String pk = next.getAttributeValue("pk").toString(); String block = next.getAttributeValue("title").toString(); String block2 = next.getAttributeValue("artist").toString(); String keyBlock = db.encode(block); //String keyBlock = db.getSoundEx(block2); Vertice v1 = new Vertice(pk, "cd", -1, block, block2); bi2.insertVertice(keyBlock, v1); this.dadosEntrada.add(v1); // System.out.println(" Pegou Id " + next.getAttributeValue("title").toString() ); }//from w w w. j ava2s . co m } return bi2; }
From source file:query.QueryExperimento.java
public void query() throws FileNotFoundException { GlobalConfig.getInstance().setInMemoryObjectThreshold(1000); // sets the CSV data source CSVSource dataSource = new CSVSource("cd", new File("cd.csv")); dataSource.enableHeader();// w w w . j a v a 2 s . com dataSource.addIdAttributes("pk"); long start = System.currentTimeMillis(); DoubleMetaphone db = new DoubleMetaphone(); int achou = 0; int nAchou = 0; int total = 0; for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) { DuDeObject next = iterator.next(); String pk = next.getAttributeValue("pk").toString(); String block = next.getAttributeValue("artist").toString(); String keyBlock = db.encode(block); boolean clusterId = bi.getId(pk, keyBlock, "cd"); total++; if (clusterId) { achou++; } else { nAchou++; } } System.err.println((System.currentTimeMillis() - start) + " ms"); System.err.println("total " + total + " n achou " + nAchou + "Achou " + achou); }
From source file:query.QueryExperimento.java
/** * * @param dataSource/*from ww w .j a va 2 s . co m*/ * @param tamanho porcentagem de elementos que n]ao se deseja guardar informaes * @return as tuplas que se deseja ter informaes em um blooco que sera processado */ public BlockIndex blocaConsultaReduzidaFixa(CSVSource dataSource, int tamanho) { BlockIndex bi2 = new BlockIndex(); DoubleMetaphone db = new DoubleMetaphone(); int numeroElementos = 0; for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) { DuDeObject next = iterator.next(); if (numeroElementos < tamanho) { String pk = next.getAttributeValue("pk").toString(); String block = next.getAttributeValue("title").toString(); String block2 = next.getAttributeValue("artist").toString(); String keyBlock = db.encode(block); Vertice v1 = new Vertice(pk, "cd", -1, block, block2); bi2.insertVertice(keyBlock, v1); numeroElementos++; } else { numeroElementos++; } } System.out.println(" Tamanho bloco " + bi2.getNumeroElementos()); return bi2; }