List of usage examples for the org.apache.commons.codec.language.DoubleMetaphone constructor DoubleMetaphone()
public DoubleMetaphone()
From source file:experimentos.LevenshteinExperimentCDQuID.java
public static void main(String[] args) throws Exception { // enables dynamic data-loading for file-based sorting GlobalConfig.getInstance().setInMemoryObjectThreshold(10000); // sets the CSV data source CSVSource dataSource = new CSVSource("cd", new File("cd.csv")); dataSource.enableHeader();//from w ww .ja va 2 s .com dataSource.addIdAttributes("pk"); // CSVSource goldstandardSource = new CSVSource("goldstandard", new File("cd_gold.csv")); // goldstandardSource.enableHeader(); // instantiate the gold standard // "cddb" is the source identifier //GoldStandard goldStandard = new GoldStandard(goldstandardSource); //goldStandard.setFirstElementsObjectIdAttributes("disc1_id"); //goldStandard.setSecondElementsObjectIdAttributes("disc2_id"); //goldStandard.setSourceIdLiteral("cddb"); // defines sub-keys that are used to generate the sorting key TextBasedSubkey artistSubkey = new TextBasedSubkey("artist"); artistSubkey.setIgnoredCharactersRegEx(TextBasedSubkey.NO_VOWELS_REGEX); DocumentFrequencyPreprocessor dfPreprocessor = new DocumentFrequencyPreprocessor("artist"); // the key generator uses sub-key selectors to generate a key for each object SortingKey sortingKey = new SortingKey(); sortingKey.addSubkey(artistSubkey); Algorithm algorithm = new SortedNeighborhoodMethod(sortingKey, 30); algorithm.addPreprocessor(dfPreprocessor); // enable in-memory storing algorithm.enableInMemoryProcessing(); // adds the "data" to the algorithm algorithm.addDataSource(dataSource); // instantiates similarity measure //SimilarityFunction similarityFunction = new TFIDFSimilarityFunction(dfPreprocessor, "title"); SimilarityFunction similarityFunction = new LevenshteinDistanceFunction("artist"); // DuDeOutput output = new CSVOutput(new File("saida.csv")); long start = System.currentTimeMillis(); // counts the generated object pairs int cnt = 0; int dupCnt = 0; int nondupCnt = 0; //Map<String, ArrayList<String>> mapaSimilares = new HashMap<String, ArrayList<String>>(); BlockIndex bi = new 
BlockIndex(); DoubleMetaphone db = new DoubleMetaphone(); // StatisticComponent statistic = new StatisticComponent(goldStandard, algorithm); for (DuDeObjectPair pair : algorithm) { if (similarityFunction.getSimilarity(pair) > 0.8) { ++dupCnt; String pk1 = pair.getFirstElement().getAttributeValue("pk").toString(); String pk2 = pair.getSecondElement().getAttributeValue("pk").toString(); String title1 = pair.getFirstElement().getAttributeValue("artist").toString(); String title2 = pair.getSecondElement().getAttributeValue("artist").toString(); String keyBlockpair1 = db.encode(title1); String keyBlockpair2 = db.encode(title2); //int cluster1 = bi.getclusterId(pk1, keyBlockpair1, "cd"); //int cluster2 = bi.getclusterId(pk2, keyBlockpair2, "cd"); Vertice v1 = new Vertice(pk1, "cd", 0); Vertice v2 = new Vertice(pk2, "cd", 0); bi.insertVertice(keyBlockpair1, v1); bi.insertVertice(keyBlockpair2, v2); // statistic.addDuplicate(pair); /**if ((cluster1 != -1) && (cluster2 == -1)) { Vertice v2 = new Vertice(pk2, "cd", cluster1); bi.insertVertice(keyBlockpair2, v2); caso1++; }else if ((cluster1 == -1) && (cluster2 != -1)) { Vertice v1 = new Vertice(pk1, "cd", cluster2); bi.insertVertice(keyBlockpair1, v1); caso2++; } else if ((cluster1 == -1) && (cluster2 == -1)) { Vertice v1 = new Vertice(pk1, "cd", cluster); Vertice v2 = new Vertice(pk2, "cd", cluster); bi.insertVertice(keyBlockpair1, v1); bi.insertVertice(keyBlockpair2, v2); cluster++; caso3++; }**/ // System.err.println( pair.getFirstElement().getAttributeValue("title").toString()); } else { ++nondupCnt; // statistic.addNonDuplicate(pair); } ++cnt; } //bi.printBlockIndex(); // System.err.println(" numero total de elementor " + bi.getNumeroElementos()); //StatisticOutput statisticOutput = new SimpleStatisticOutput(System.out, statistic); // statisticOutput.writeStatistics(); algorithm.cleanUp(); // print statistics // System.out.println(); // System.out.println(); System.err.println( dupCnt + " duplicates out of " + cnt + " 
pairs detected in " + (System.currentTimeMillis() - start) + " ms " + bi.getNumeroElementos() + " " + bi.getNumeroBlocos()); //System.err.println(" casos " + caso1 + " " + caso2 + " " + caso3 + " "); QueryExperimento query = new QueryExperimento(bi); query.query(); }
From source file:LanguageUsage.java
public void start() throws EncoderException, DecoderException { String word1 = "Wilson"; String word2 = "Wylson"; String foreignWord1 = "Otto"; String foreignWord2 = "Auto"; Soundex sndx = new Soundex(); DoubleMetaphone doubleMetaphone = new DoubleMetaphone(); System.err.println("Soundex Code for Wilson is: " + sndx.encode("Wilson")); System.err.println("Soundex Code for Wylson is: " + sndx.encode("Wylson")); // Use the StringEncoderComparator to compare these two Strings StringEncoderComparator comparator1 = new StringEncoderComparator(sndx); System.err//from www.j a v a 2 s . c o m .println("Are Wilson and Wylson same based on Soundex? " + comparator1.compare("Wilson", "Wylson")); System.err.println("Are Auto and Otto same based on Soundex? " + comparator1.compare("Auto", "Otto")); StringEncoderComparator comparator2 = new StringEncoderComparator(doubleMetaphone); System.err .println("Are Auto and Otto same based on DoubleMetaphone? " + comparator2.compare("Auto", "Otto")); System.err.println( "Double Metaphone primary code for Schmidt: " + doubleMetaphone.doubleMetaphone("Schmidt")); System.err.println( "Double Metaphone secondary code for Schmidt: " + doubleMetaphone.doubleMetaphone("Schmidt", true)); }
From source file:com.vangent.hieos.empi.transform.DoubleMetaphoneTransformFunction.java
/** * /*from w w w . ja v a 2 s . c o m*/ * @param obj * @return */ public Object transform(Object obj) { DoubleMetaphone encoder = new DoubleMetaphone(); return encoder.encode((String) obj); }
From source file:DataStructures.SchemaBasedProfiles.AbstractProfile.java
/**
 * Creates a profile for the given entity URL and prepares its Double Metaphone encoder.
 *
 * @param url the entity URL stored in {@code entityUrl}
 */
public AbstractProfile(String url) {
    this.entityUrl = url;
    this.doubleMetaphone = new DoubleMetaphone();
}
From source file:dkpro.similarity.algorithms.sound.DoubleMetaphoneComparator.java
/**
 * Creates a comparator whose {@code encoder} is a new {@code DoubleMetaphone} instance.
 */
public DoubleMetaphoneComparator() {
    this.encoder = new DoubleMetaphone();
}
From source file:de.tudarmstadt.ukp.dkpro.core.commonscodec.DoubleMetaphonePhoneticTranscriptor.java
public DoubleMetaphonePhoneticTranscriptor() { this.encoder = new DoubleMetaphone(); }
From source file:com.kodemore.text.KmTextUtilities.java
/** * The metaphone alghorithm./*from www . j ava 2s .c om*/ * This improves over the original metaphone. * It uses a much more complex ruleset. * * http://en.wikipedia.org/wiki/Metaphone */ public static String doubleMetaphone(String s) { return new DoubleMetaphone().encode(s); }
From source file:ca.sqlpower.matchmaker.munge.DoubleMetaphoneMungeStep.java
/**
 * Reads the first input of this step and writes its Double Metaphone code to the output.
 *
 * <p>A {@code null} input value is passed through as {@code null}. Otherwise the value is
 * encoded with {@code DoubleMetaphone.doubleMetaphone(String, boolean)}, using
 * {@code isUseAlternate()} to choose between the primary and the alternate code.
 *
 * @return always {@code true}
 * @throws Exception declared by the signature; no checked exception is raised explicitly here
 */
public Boolean doCall() throws Exception {
    final MungeStepOutput<String> output = getOut();
    final MungeStepOutput<String> firstInput = getMSOInputs().get(0);
    final String value = firstInput.getData();

    final String encoded = (value == null)
            ? null
            : new DoubleMetaphone().doubleMetaphone(value, isUseAlternate());
    output.setData(encoded);

    return Boolean.TRUE;
}
From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java
/** * Computes attribute proposals where the class/definition name must match exactly, but where * parameters are processed with fuzzy logic. * //from ww w. j a v a2 s . c o m * @param currentName * @param descs * @param searchPath * TODO * @param types * @return */ public String[] computeAttributeProposals(final QualifiedName currentName, Collection<IEObjectDescription> descs, PPSearchPath searchPath) { if (currentName.getSegmentCount() < 2) return new String[0]; final DoubleMetaphone encoder = new DoubleMetaphone(); final String metaphoneName = encoder.encode(currentName.getLastSegment()); Collection<String> proposals = generateAttributeCandidates(currentName, descs, searchPath); // propose all, but sort them based on likeness String[] result = new String[proposals.size()]; proposals.toArray(result); Arrays.sort(result, new PronunciationComparator(encoder, metaphoneName)); return result; }
From source file:com.panet.imeta.core.row.ValueDataUtil.java
/**
 * Returns the Double Metaphone code of the string form of a value.
 *
 * @param metaA value metadata; not read by this method
 * @param dataA the value to encode, converted with {@code toString()}; may be {@code null}
 * @return {@code null} when {@code dataA} is {@code null}, otherwise the result of
 *         {@code DoubleMetaphone.doubleMetaphone(String)} for {@code dataA.toString()}
 */
public static String get_Double_Metaphone(ValueMetaInterface metaA, Object dataA) {
    if (dataA != null) {
        final DoubleMetaphone encoder = new DoubleMetaphone();
        return encoder.doubleMetaphone(dataA.toString());
    }
    return null;
}