public String encode(String value) 

Encode the value using DoubleMetaphone.


From source file:experimentos.LevenshteinExperimentCDQuID.java

/**
 * Experiment driver: reads CD records from "cd.csv", iterates the object pairs produced by a
 * SortedNeighborhoodMethod and, for every pair whose Levenshtein similarity on the "artist"
 * attribute is above 0.8, inserts both records into a BlockIndex under the Double Metaphone
 * code of their "artist" value.
 *
 * NOTE(review): this excerpt is truncated. The block comment opened further down is never
 * closed, several closing braces are missing, and the final print statement has lost its
 * beginning, so the method does not compile as shown.
 *
 * @param args command-line arguments; not read in the visible code
 * @throws Exception declared by the signature; the visible code does not show which call throws
 */
public static void main(String[] args) throws Exception {

    // enables dynamic data-loading for file-based sorting

    // sets the CSV data source
    CSVSource dataSource = new CSVSource("cd", new File("cd.csv"));
    dataSource.enableHeader();

    //   CSVSource goldstandardSource = new CSVSource("goldstandard", new File("cd_gold.csv"));
    //   goldstandardSource.enableHeader();

    // instantiate the gold standard
    // "cddb" is the source identifier
    //GoldStandard goldStandard = new GoldStandard(goldstandardSource);

    // defines sub-keys that are used to generate the sorting key
    // NOTE(review): artistSubkey and dfPreprocessor are not referenced again in the visible code.
    TextBasedSubkey artistSubkey = new TextBasedSubkey("artist");

    DocumentFrequencyPreprocessor dfPreprocessor = new DocumentFrequencyPreprocessor("artist");
    // the key generator uses sub-key selectors to generate a key for each object

    // NOTE(review): no sub-key is added to sortingKey in the visible code - confirm against the full source.
    SortingKey sortingKey = new SortingKey();

    // 30 is presumably the neighborhood window size - TODO confirm against SortedNeighborhoodMethod.
    Algorithm algorithm = new SortedNeighborhoodMethod(sortingKey, 30);

    // enable in-memory storing

    // adds the "data" to the algorithm
    // NOTE(review): the statement that registers dataSource with the algorithm is not visible here.

    // instantiates similarity measure
    //SimilarityFunction similarityFunction = new TFIDFSimilarityFunction(dfPreprocessor, "title");
    SimilarityFunction similarityFunction = new LevenshteinDistanceFunction("artist");
    // DuDeOutput output = new CSVOutput(new File("saida.csv"));

    // start timestamp for the elapsed-time figure printed at the end
    long start = System.currentTimeMillis();

    // counts the generated object pairs
    // NOTE(review): cnt, dupCnt and nondupCnt are never incremented in the visible code.
    int cnt = 0;

    int dupCnt = 0;
    int nondupCnt = 0;

    //Map<String, ArrayList<String>> mapaSimilares = new HashMap<String, ArrayList<String>>();

    BlockIndex bi = new BlockIndex();
    DoubleMetaphone db = new DoubleMetaphone();
    //  StatisticComponent statistic = new StatisticComponent(goldStandard, algorithm);
    for (DuDeObjectPair pair : algorithm) {
        // pairs scoring above 0.8 are handled as duplicates
        if (similarityFunction.getSimilarity(pair) > 0.8) {
            String pk1 = pair.getFirstElement().getAttributeValue("pk").toString();
            String pk2 = pair.getSecondElement().getAttributeValue("pk").toString();

            // despite the names, title1/title2 hold the "artist" attribute values
            String title1 = pair.getFirstElement().getAttributeValue("artist").toString();
            String title2 = pair.getSecondElement().getAttributeValue("artist").toString();

            // the Double Metaphone code of the artist value is the block key
            String keyBlockpair1 = db.encode(title1);
            String keyBlockpair2 = db.encode(title2);

            //int cluster1 = bi.getclusterId(pk1, keyBlockpair1, "cd");
            //int cluster2 = bi.getclusterId(pk2, keyBlockpair2, "cd");

            // third argument is presumably the cluster id (the disabled code below passes cluster1/cluster2) - confirm
            Vertice v1 = new Vertice(pk1, "cd", 0);
            Vertice v2 = new Vertice(pk2, "cd", 0);

            bi.insertVertice(keyBlockpair1, v1);
            bi.insertVertice(keyBlockpair2, v2);
            // statistic.addDuplicate(pair);

            /**if ((cluster1 != -1) && (cluster2 == -1)) {
            Vertice v2 = new Vertice(pk2, "cd", cluster1);
            bi.insertVertice(keyBlockpair2, v2);
            }else if ((cluster1 == -1) && (cluster2 != -1)) {
             Vertice v1 = new Vertice(pk1, "cd", cluster2);
                bi.insertVertice(keyBlockpair1, v1);
            }  else if ((cluster1 == -1) && (cluster2 == -1)) {
             Vertice v1 = new Vertice(pk1, "cd", cluster);
                Vertice v2 = new Vertice(pk2, "cd", cluster);
                bi.insertVertice(keyBlockpair1, v1);
                bi.insertVertice(keyBlockpair2, v2);

            // System.err.println(  pair.getFirstElement().getAttributeValue("title").toString());
        } else {
            //  statistic.addNonDuplicate(pair);
    // System.err.println(" numero total de elementor " + bi.getNumeroElementos());
    //StatisticOutput statisticOutput = new SimpleStatisticOutput(System.out, statistic);
    //      statisticOutput.writeStatistics();

    // print statistics
    //  System.out.println();
    // System.out.println();

            dupCnt + " duplicates out of " + cnt + " pairs detected in " + (System.currentTimeMillis() - start)
                    + " ms  " + bi.getNumeroElementos() + "  " + bi.getNumeroBlocos());
    //System.err.println(" casos  " + caso1 + " " + caso2 + " " + caso3 + " ");
    QueryExperimento query = new QueryExperimento(bi);


From source file:it.univpm.deit.semedia.musicuri.core.Toolset.java

/**
 * Generates a list of terms that are the metaphone equivalents of the words in the given list.
 * The terms are generated using the double metaphone phonetic matching algorithm (Apache implementation).
 *
 * @param keywords an ArrayList object containing the keywords to generate metaphones for
 * @return an ArrayList object containing the generated metaphone equivalent terms
 */
public static ArrayList GenerateMetaphones(ArrayList keywords) {
    ArrayList metaphoneList = new ArrayList(keywords.size());
    DoubleMetaphone meta = new DoubleMetaphone();
    String tmp = null;

    for (int i = 0; i < keywords.size(); i++) {
        tmp = meta.encode((String) keywords.get(i));
    return metaphoneList;

From source file:com.vangent.hieos.empi.transform.DoubleMetaphoneTransformFunction.java

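/**
 * Encodes the given value with Double Metaphone.
 *
 * @param obj the value to encode; it is cast to String
 * @return the Double Metaphone encoding of obj
 */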
public Object transform(Object obj) {
    DoubleMetaphone encoder = new DoubleMetaphone();
    return encoder.encode((String) obj);

From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java

/**
 * Computes attribute proposals where the class/definition name must match exactly, but where
 * parameters are processed with fuzzy logic.
 *
 * @param currentName
 * @param descs
 * @param searchPath
 *            TODO
 * @param types
 * @return the attribute proposals, sorted by likeness to the last segment of currentName
 */
public String[] computeAttributeProposals(final QualifiedName currentName,
        Collection<IEObjectDescription> descs, PPSearchPath searchPath) {
    if (currentName.getSegmentCount() < 2)
        return new String[0];

    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String metaphoneName = encoder.encode(currentName.getLastSegment());

    Collection<String> proposals = generateAttributeCandidates(currentName, descs, searchPath);
    // propose all, but sort them based on likeness

    String[] result = new String[proposals.size()];
    Arrays.sort(result, new PronunciationComparator(encoder, metaphoneName));
    return result;

From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java

/**
 * Attempts to produce a list of names that are close to the given name. At most 5 proposals
 * are generated. The returned proposals are made in order of "pronunciation distance" which is
 * obtained by taking the Levenshtein distance between the Double Metaphone encodings of
 * candidate and given name. Candidates are selected as the names with shortest Levenshtein distance
 * and names whose Metaphone encoding equals, starts with, or ends with that of the given name.
 *
 * @param currentName
 *            the name for which proposals are to be generated
 * @param descs
 *            the descriptors of available named values
 * @param searchPath
 *            TODO
 * @param types
 *            if stated, the wanted types of named values
 * @return
 *         array of proposals, possibly empty, but never null.
 */
public String[] computeProposals(final String currentName, Collection<IEObjectDescription> descs,
        boolean upperCaseProposals, PPSearchPath searchPath, EClass... types) {
    // NOTE(review): this excerpt is truncated - several closing braces and at least the
    // statements that end the type filter and fill `result` are not visible.

    // a null or empty name yields no proposals
    if (currentName == null || currentName.length() < 1)
        return new String[0];

    // compute the 5 best matches and only accept if score <= 5
    ScoreKeeper<IEObjectDescription> tracker = new ScoreKeeper<IEObjectDescription>(5, false, 5);
    // List<IEObjectDescription> metaphoneAlike = Lists.newArrayList();
    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String metaphoneName = encoder.encode(currentName);

    for (IEObjectDescription d : descs) {
        EClass c = d.getEClass();
        // type filter: leaving the labelled block via `break typeok` means the candidate's
        // class is one of the wanted types (or a subtype of one)
        // NOTE(review): what happens when no wanted type matches is not visible here.
        typeok: if (types != null && types.length > 0) {
            for (EClass wanted : types)
                if ((wanted == c || wanted.isSuperTypeOf(c)))
                    break typeok;
        // filter based on path visibility
        if (searchPath.searchIndexOf(d) == -1)
            continue; // not visible according to path

        // `converter` is not declared in this block - presumably a field of the enclosing class
        String candidateName = converter.toString(d.getName());
        // first score: Levenshtein distance between the plain names
        tracker.addScore(StringUtils.getLevenshteinDistance(currentName, candidateName), d);
        String candidateMetaphone = encoder.encode(candidateName);
        // metaphone matches are scored on the pronunciation distance
        if (metaphoneName.equals(candidateMetaphone) //
                || candidateMetaphone.startsWith(metaphoneName) //
                || candidateMetaphone.endsWith(metaphoneName) //
            tracker.addScore(StringUtils.getLevenshteinDistance(metaphoneName, candidateMetaphone), d);
        // System.err.printf("Metaphone alike: %s == %s\n", currentName, candidateName);
    List<String> result = Lists.newArrayList();
    // System.err.print("Scores = ");
    // NOTE(review): `s` is computed for every score entry but never added to `result` in the visible code.
    for (ScoreEntry<IEObjectDescription> entry : tracker.getScoreEntries()) {
        String s = converter.toString(entry.getData().getName());
        // System.err.printf("%d %s, ", entry.getScore(), s);
    // System.err.println();

    String[] proposals = result.toArray(new String[result.size()]);

    // order the proposals with a comparator built from the encoder and the typed name's encoding
    PronunciationComparator x = new PronunciationComparator(encoder, metaphoneName);

    Arrays.sort(proposals, x);
    // System.err.print("Order = ");
    // for(int i = 0; i < proposals.length; i++)
    // System.err.printf("%s, ", proposals[i]);
    // System.err.println();
    // the proposals are passed through toUpperCaseProposals only when the caller asked for it
    return upperCaseProposals ? toUpperCaseProposals(proposals) : proposals;

From source file:org.openregistry.core.domain.AbstractNameImpl.java

protected final String generateSoundEx(final String comparison) {
    final DoubleMetaphone dmp = new DoubleMetaphone();
    return dmp.encode(comparison);

From source file:org.vivoweb.harvester.score.algorithm.NormalizedDoubleMetaphoneDifference.java

public float calculate(CharSequence itemX, CharSequence itemY) {
    if (itemX.length() == 0 || itemY.length() == 0) {
        return 0f;
    }/*from  w ww. j av  a  2s  .  c o m*/
    DoubleMetaphone dm = new DoubleMetaphone();
    String dmX = dm.encode(itemX.toString());
    String dmY = dm.encode(itemY.toString());
    return new NormalizedLevenshteinDifference().calculate(dmX, dmY);

From source file:query.Amostra.java

public BlockIndex blocaDadosDaAmostraConsulta(CSVSource dataSource) {

    BlockIndex bi2 = new BlockIndex();
    //SoundEx db = new SoundEx();

    DoubleMetaphone db = new DoubleMetaphone();
    for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) {
        DuDeObject next = iterator.next();
        if (encontraKeySelecionada(next.getAttributeValue("key").toString())) {
            String pk = next.getAttributeValue("pk").toString();
            String block = next.getAttributeValue("title").toString();
            String block2 = next.getAttributeValue("artist").toString();

            String keyBlock = db.encode(block);
            //String keyBlock = db.getSoundEx(block2);
            Vertice v1 = new Vertice(pk, "cd", -1, block, block2);
            bi2.insertVertice(keyBlock, v1);
            //   System.out.println(" Pegou Id   " + next.getAttributeValue("title").toString() );
        }//from w w  w.  j ava2s .  co m

    return bi2;


From source file:query.QueryExperimento.java

/**
 * Iterates every record of "cd.csv", encodes its "artist" value with Double Metaphone and
 * passes the record's "pk", that code and "cd" to {@code bi.getId}; finally prints the
 * elapsed time and the counters to standard error.
 *
 * NOTE(review): this excerpt is truncated. Both branches of the if/else are empty, the
 * counters are never updated in the visible code, and closing braces are missing.
 *
 * @throws FileNotFoundException declared by the signature; presumably raised when "cd.csv" is missing - confirm
 */
public void query() throws FileNotFoundException {

    // sets the CSV data source
    CSVSource dataSource = new CSVSource("cd", new File("cd.csv"));
    dataSource.enableHeader();
    long start = System.currentTimeMillis();

    DoubleMetaphone db = new DoubleMetaphone();
    // counters reported at the end: achou = "found", nAchou = "not found"
    int achou = 0;
    int nAchou = 0;
    int total = 0;

    for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) {
        DuDeObject next = iterator.next();

        String pk = next.getAttributeValue("pk").toString();
        String block = next.getAttributeValue("artist").toString();

        // the Double Metaphone code of the artist value is the block key
        String keyBlock = db.encode(block);

        // `bi` is not declared in this block - presumably a field of the enclosing class
        boolean clusterId = bi.getId(pk, keyBlock, "cd");
        if (clusterId) {

        } else {


    System.err.println((System.currentTimeMillis() - start) + " ms");
    System.err.println("total " + total + " n achou " + nAchou + "Achou " + achou);

From source file:query.QueryExperimento.java

/**
 * @param dataSource the data source whose records are read
 * @param tamanho percentage of elements for which no information should be stored
 * @return the tuples for which information is wanted, gathered in a block that will be processed
 */
public BlockIndex blocaConsultaReduzidaFixa(CSVSource dataSource, int tamanho) {
    // NOTE(review): this excerpt is truncated - numeroElementos is never advanced in the
    // visible code, the else branch is empty, and closing braces are missing.

    BlockIndex bi2 = new BlockIndex();
    DoubleMetaphone db = new DoubleMetaphone();
    // compared against tamanho below to decide whether a record is still inserted
    int numeroElementos = 0;

    for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) {
        DuDeObject next = iterator.next();
        if (numeroElementos < tamanho) {
            String pk = next.getAttributeValue("pk").toString();
            String block = next.getAttributeValue("title").toString();
            String block2 = next.getAttributeValue("artist").toString();

            // the Double Metaphone code of the title is the block key; the artist is only stored on the vertex
            String keyBlock = db.encode(block);
            Vertice v1 = new Vertice(pk, "cd", -1, block, block2);
            bi2.insertVertice(keyBlock, v1);

        } else {


    // reports the value of bi2.getNumeroElementos() on standard output before returning the index
    System.out.println(" Tamanho bloco " + bi2.getNumeroElementos());
    return bi2;
