Example usage for org.apache.commons.codec.language.DoubleMetaphone.encode

Introduction

This page collects example usages of org.apache.commons.codec.language.DoubleMetaphone.encode(String).

Prototype

public String encode(String value)

Source Link

Document

Encode the value using DoubleMetaphone.

Usage

From source file:experimentos.LevenshteinExperimentCDQuID.java

public static void main(String[] args) throws Exception {

    // enables dynamic data-loading for file-based sorting
    GlobalConfig.getInstance().setInMemoryObjectThreshold(10000);

    // sets the CSV data source
    CSVSource dataSource = new CSVSource("cd", new File("cd.csv"));
    dataSource.enableHeader();//from  ww  w. j a v  a  2 s . co m
    dataSource.addIdAttributes("pk");

    //   CSVSource goldstandardSource = new CSVSource("goldstandard", new File("cd_gold.csv"));
    //   goldstandardSource.enableHeader();

    // instantiate the gold standard
    // "cddb" is the source identifier
    //GoldStandard goldStandard = new GoldStandard(goldstandardSource);
    //goldStandard.setFirstElementsObjectIdAttributes("disc1_id");
    //goldStandard.setSecondElementsObjectIdAttributes("disc2_id");
    //goldStandard.setSourceIdLiteral("cddb");

    // defines sub-keys that are used to generate the sorting key
    TextBasedSubkey artistSubkey = new TextBasedSubkey("artist");
    artistSubkey.setIgnoredCharactersRegEx(TextBasedSubkey.NO_VOWELS_REGEX);

    DocumentFrequencyPreprocessor dfPreprocessor = new DocumentFrequencyPreprocessor("artist");
    // the key generator uses sub-key selectors to generate a key for each object

    SortingKey sortingKey = new SortingKey();
    sortingKey.addSubkey(artistSubkey);

    Algorithm algorithm = new SortedNeighborhoodMethod(sortingKey, 30);
    algorithm.addPreprocessor(dfPreprocessor);

    // enable in-memory storing
    algorithm.enableInMemoryProcessing();

    // adds the "data" to the algorithm
    algorithm.addDataSource(dataSource);

    // instantiates similarity measure
    //SimilarityFunction similarityFunction = new TFIDFSimilarityFunction(dfPreprocessor, "title");
    SimilarityFunction similarityFunction = new LevenshteinDistanceFunction("artist");
    // DuDeOutput output = new CSVOutput(new File("saida.csv"));

    long start = System.currentTimeMillis();

    // counts the generated object pairs
    int cnt = 0;

    int dupCnt = 0;
    int nondupCnt = 0;

    //Map<String, ArrayList<String>> mapaSimilares = new HashMap<String, ArrayList<String>>();

    BlockIndex bi = new BlockIndex();
    DoubleMetaphone db = new DoubleMetaphone();
    //  StatisticComponent statistic = new StatisticComponent(goldStandard, algorithm);
    for (DuDeObjectPair pair : algorithm) {
        if (similarityFunction.getSimilarity(pair) > 0.8) {
            ++dupCnt;
            String pk1 = pair.getFirstElement().getAttributeValue("pk").toString();
            String pk2 = pair.getSecondElement().getAttributeValue("pk").toString();

            String title1 = pair.getFirstElement().getAttributeValue("artist").toString();
            String title2 = pair.getSecondElement().getAttributeValue("artist").toString();

            String keyBlockpair1 = db.encode(title1);
            String keyBlockpair2 = db.encode(title2);

            //int cluster1 = bi.getclusterId(pk1, keyBlockpair1, "cd");
            //int cluster2 = bi.getclusterId(pk2, keyBlockpair2, "cd");

            Vertice v1 = new Vertice(pk1, "cd", 0);
            Vertice v2 = new Vertice(pk2, "cd", 0);

            bi.insertVertice(keyBlockpair1, v1);
            bi.insertVertice(keyBlockpair2, v2);
            // statistic.addDuplicate(pair);

            /**if ((cluster1 != -1) && (cluster2 == -1)) {
            Vertice v2 = new Vertice(pk2, "cd", cluster1);
            bi.insertVertice(keyBlockpair2, v2);
            caso1++;
                    
            }else if ((cluster1 == -1) && (cluster2 != -1)) {
             Vertice v1 = new Vertice(pk1, "cd", cluster2);
                bi.insertVertice(keyBlockpair1, v1);
                caso2++;
                    
            }  else if ((cluster1 == -1) && (cluster2 == -1)) {
             Vertice v1 = new Vertice(pk1, "cd", cluster);
                Vertice v2 = new Vertice(pk2, "cd", cluster);
                    
                bi.insertVertice(keyBlockpair1, v1);
                bi.insertVertice(keyBlockpair2, v2);
               cluster++;
               caso3++;
            }**/

            // System.err.println(  pair.getFirstElement().getAttributeValue("title").toString());
        } else {
            ++nondupCnt;
            //  statistic.addNonDuplicate(pair);
        }
        ++cnt;
    }
    //bi.printBlockIndex();
    // System.err.println(" numero total de elementor " + bi.getNumeroElementos());
    //StatisticOutput statisticOutput = new SimpleStatisticOutput(System.out, statistic);
    //      statisticOutput.writeStatistics();
    algorithm.cleanUp();

    // print statistics
    //  System.out.println();
    // System.out.println();

    System.err.println(
            dupCnt + " duplicates out of " + cnt + " pairs detected in " + (System.currentTimeMillis() - start)
                    + " ms  " + bi.getNumeroElementos() + "  " + bi.getNumeroBlocos());
    //System.err.println(" casos  " + caso1 + " " + caso2 + " " + caso3 + " ");
    QueryExperimento query = new QueryExperimento(bi);
    query.query();

}

From source file:it.univpm.deit.semedia.musicuri.core.Toolset.java

/**
 * Genarates a list of terms that are the metaphone equivalents of the words in the given list.
 * The terms are generated using the double metaphone phonetic maching algorithm (apache implementation)
 * @param keywords an aArrayList object containing the keywords to generate metaphones for 
 * @return an aArrayList object containing the generated metaphone equivalent terms
 *//* www. ja va 2s.  co  m*/
public static ArrayList GenerateMetaphones(ArrayList keywords) {
    ArrayList metaphoneList = new ArrayList(keywords.size());
    DoubleMetaphone meta = new DoubleMetaphone();
    String tmp = null;

    for (int i = 0; i < keywords.size(); i++) {
        tmp = meta.encode((String) keywords.get(i));
        metaphoneList.add(tmp);
    }
    return metaphoneList;
}

From source file:com.vangent.hieos.empi.transform.DoubleMetaphoneTransformFunction.java

/**
 * /*from w  w w  . jav a2s  .  c  om*/
 * @param obj
 * @return
 */
public Object transform(Object obj) {
    DoubleMetaphone encoder = new DoubleMetaphone();
    return encoder.encode((String) obj);
}

From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java

/**
 * Computes attribute proposals where the class/definition name must match exactly, but where
 * parameters are processed with fuzzy logic.
 * //from   w  ww  . j  a v a  2s.  co m
 * @param currentName
 * @param descs
 * @param searchPath
 *            TODO
 * @param types
 * @return
 */
public String[] computeAttributeProposals(final QualifiedName currentName,
        Collection<IEObjectDescription> descs, PPSearchPath searchPath) {
    if (currentName.getSegmentCount() < 2)
        return new String[0];

    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String metaphoneName = encoder.encode(currentName.getLastSegment());

    Collection<String> proposals = generateAttributeCandidates(currentName, descs, searchPath);
    // propose all, but sort them based on likeness

    String[] result = new String[proposals.size()];
    proposals.toArray(result);
    Arrays.sort(result, new PronunciationComparator(encoder, metaphoneName));
    return result;
}

From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java

/**
 * Attempts to produce a list of names that are close to the given name. At most 5 proposals
 * are generated. The returned proposals are made in order of "pronunciation distance" which is
 * obtained by taking the Levenshtein distance between the Double Monophone encodings of
 * candidate and given name. Candidates are selected as the names with shortest Levenshtein distance
 * and names that are Monophonically equal, or starts or ends monophonically.
 * /*ww w.ja  va2  s  . co m*/
 * @param currentName
 *            the name for which proposals are to be generated
 * @param descs
 *            the descriptors of available named values
 * @param searchPath
 *            TODO
 * @param types
 *            if stated, the wanted types of named values
 * @return
 *         array of proposals, possibly empty, but never null.
 */
public String[] computeProposals(final String currentName, Collection<IEObjectDescription> descs,
        boolean upperCaseProposals, PPSearchPath searchPath, EClass... types) {
    if (currentName == null || currentName.length() < 1)
        return new String[0];

    // compute the 5 best matches and only accept if score <= 5
    ScoreKeeper<IEObjectDescription> tracker = new ScoreKeeper<IEObjectDescription>(5, false, 5);
    // List<IEObjectDescription> metaphoneAlike = Lists.newArrayList();
    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String metaphoneName = encoder.encode(currentName);

    for (IEObjectDescription d : descs) {
        EClass c = d.getEClass();
        typeok: if (types != null && types.length > 0) {
            for (EClass wanted : types)
                if ((wanted == c || wanted.isSuperTypeOf(c)))
                    break typeok;
            continue;
        }
        // filter based on path visibility
        if (searchPath.searchIndexOf(d) == -1)
            continue; // not visible according to path

        String candidateName = converter.toString(d.getName());
        tracker.addScore(StringUtils.getLevenshteinDistance(currentName, candidateName), d);
        String candidateMetaphone = encoder.encode(candidateName);
        // metaphone matches are scored on the pronounciation distance
        if (metaphoneName.equals(candidateMetaphone) //
                || candidateMetaphone.startsWith(metaphoneName) //
                || candidateMetaphone.endsWith(metaphoneName) //
        )
            tracker.addScore(StringUtils.getLevenshteinDistance(metaphoneName, candidateMetaphone), d);
        // System.err.printf("Metaphone alike: %s == %s\n", currentName, candidateName);
    }
    List<String> result = Lists.newArrayList();
    // System.err.print("Scores = ");
    for (ScoreEntry<IEObjectDescription> entry : tracker.getScoreEntries()) {
        String s = converter.toString(entry.getData().getName());
        result.add(s);
        // System.err.printf("%d %s, ", entry.getScore(), s);
    }
    // System.err.println();

    String[] proposals = result.toArray(new String[result.size()]);

    PronunciationComparator x = new PronunciationComparator(encoder, metaphoneName);

    Arrays.sort(proposals, x);
    // System.err.print("Order = ");
    // for(int i = 0; i < proposals.length; i++)
    // System.err.printf("%s, ", proposals[i]);
    // System.err.println();
    return upperCaseProposals ? toUpperCaseProposals(proposals) : proposals;
}

From source file:org.openregistry.core.domain.AbstractNameImpl.java

protected final String generateSoundEx(final String comparison) {
    final DoubleMetaphone dmp = new DoubleMetaphone();
    return dmp.encode(comparison);
}

From source file:org.vivoweb.harvester.score.algorithm.NormalizedDoubleMetaphoneDifference.java

@Override
public float calculate(CharSequence itemX, CharSequence itemY) {
    if (itemX.length() == 0 || itemY.length() == 0) {
        return 0f;
    }/*from  w ww. j av  a  2s  .  c o m*/
    DoubleMetaphone dm = new DoubleMetaphone();
    String dmX = dm.encode(itemX.toString());
    String dmY = dm.encode(itemY.toString());
    return new NormalizedLevenshteinDifference().calculate(dmX, dmY);
}

From source file:query.Amostra.java

public BlockIndex blocaDadosDaAmostraConsulta(CSVSource dataSource) {

    BlockIndex bi2 = new BlockIndex();
    //SoundEx db = new SoundEx();

    DoubleMetaphone db = new DoubleMetaphone();
    for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) {
        DuDeObject next = iterator.next();
        if (encontraKeySelecionada(next.getAttributeValue("key").toString())) {
            String pk = next.getAttributeValue("pk").toString();
            String block = next.getAttributeValue("title").toString();
            String block2 = next.getAttributeValue("artist").toString();

            String keyBlock = db.encode(block);
            //String keyBlock = db.getSoundEx(block2);
            Vertice v1 = new Vertice(pk, "cd", -1, block, block2);
            bi2.insertVertice(keyBlock, v1);
            this.dadosEntrada.add(v1);
            //   System.out.println(" Pegou Id   " + next.getAttributeValue("title").toString() );
        }//from w w  w.  j ava2s .  co m
    }

    return bi2;

}

From source file:query.QueryExperimento.java

public void query() throws FileNotFoundException {
    GlobalConfig.getInstance().setInMemoryObjectThreshold(1000);

    // sets the CSV data source
    CSVSource dataSource = new CSVSource("cd", new File("cd.csv"));
    dataSource.enableHeader();//  w w  w  . j a v a 2 s . com
    dataSource.addIdAttributes("pk");
    long start = System.currentTimeMillis();

    DoubleMetaphone db = new DoubleMetaphone();
    int achou = 0;
    int nAchou = 0;
    int total = 0;

    for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) {
        DuDeObject next = iterator.next();

        String pk = next.getAttributeValue("pk").toString();
        String block = next.getAttributeValue("artist").toString();

        String keyBlock = db.encode(block);

        boolean clusterId = bi.getId(pk, keyBlock, "cd");
        total++;
        if (clusterId) {
            achou++;

        } else {
            nAchou++;

        }

    }
    System.err.println((System.currentTimeMillis() - start) + " ms");
    System.err.println("total " + total + " n achou " + nAchou + "Achou " + achou);
}

From source file:query.QueryExperimento.java

/**
* 
* @param dataSource/*from   ww w .j  a  va 2 s . co m*/
* @param tamanho porcentagem de elementos que n]ao se deseja guardar informaes
* @return as tuplas que se deseja ter informaes em um blooco que sera processado
*/
public BlockIndex blocaConsultaReduzidaFixa(CSVSource dataSource, int tamanho) {

    BlockIndex bi2 = new BlockIndex();
    DoubleMetaphone db = new DoubleMetaphone();
    int numeroElementos = 0;

    for (Iterator<DuDeObject> iterator = dataSource.iterator(); iterator.hasNext();) {
        DuDeObject next = iterator.next();
        if (numeroElementos < tamanho) {
            String pk = next.getAttributeValue("pk").toString();
            String block = next.getAttributeValue("title").toString();
            String block2 = next.getAttributeValue("artist").toString();

            String keyBlock = db.encode(block);
            Vertice v1 = new Vertice(pk, "cd", -1, block, block2);
            bi2.insertVertice(keyBlock, v1);
            numeroElementos++;

        } else {

            numeroElementos++;
        }

    }
    System.out.println(" Tamanho bloco " + bi2.getNumeroElementos());
    return bi2;

}