Example usage for org.apache.commons.codec.language DoubleMetaphone DoubleMetaphone

List of usage examples for org.apache.commons.codec.language DoubleMetaphone DoubleMetaphone

Introduction

On this page you can find example usages of the org.apache.commons.codec.language.DoubleMetaphone constructor DoubleMetaphone().

Prototype

public DoubleMetaphone() 

Source Link

Document

Creates an instance of this DoubleMetaphone encoder.

Usage

From source file:experimentos.LevenshteinExperimentCDQuID.java

/**
 * Runs a sorted-neighborhood duplicate detection pass over {@code cd.csv} and indexes
 * every record that takes part in a detected duplicate pair.
 *
 * <p>A pair counts as a duplicate when the Levenshtein similarity of its {@code artist}
 * values is greater than 0.8. Both records of such a pair are inserted into a
 * {@code BlockIndex} under the Double Metaphone code of their artist value. After the
 * run, a one-line summary is written to standard error and the populated index is handed
 * to {@code QueryExperimento}.
 *
 * @param args command-line arguments; not used
 * @throws Exception if any step of the experiment fails
 */
public static void main(String[] args) throws Exception {

    // enables dynamic data-loading for file-based sorting
    GlobalConfig.getInstance().setInMemoryObjectThreshold(10000);

    // sets the CSV data source; records are identified by the "pk" attribute
    CSVSource dataSource = new CSVSource("cd", new File("cd.csv"));
    dataSource.enableHeader();
    dataSource.addIdAttributes("pk");

    // defines the sub-key used to generate the sorting key: the artist value with the
    // characters matched by NO_VOWELS_REGEX ignored
    TextBasedSubkey artistSubkey = new TextBasedSubkey("artist");
    artistSubkey.setIgnoredCharactersRegEx(TextBasedSubkey.NO_VOWELS_REGEX);

    DocumentFrequencyPreprocessor dfPreprocessor = new DocumentFrequencyPreprocessor("artist");

    // the key generator uses sub-key selectors to generate a key for each object
    SortingKey sortingKey = new SortingKey();
    sortingKey.addSubkey(artistSubkey);

    // NOTE(review): 30 is presumably the neighborhood window size - confirm against
    // SortedNeighborhoodMethod
    Algorithm algorithm = new SortedNeighborhoodMethod(sortingKey, 30);
    algorithm.addPreprocessor(dfPreprocessor);

    // enable in-memory storing
    algorithm.enableInMemoryProcessing();

    // adds the "data" to the algorithm
    algorithm.addDataSource(dataSource);

    // instantiates the similarity measure
    SimilarityFunction similarityFunction = new LevenshteinDistanceFunction("artist");

    long start = System.currentTimeMillis();

    // number of generated object pairs, and how many of them were classified as duplicates
    int cnt = 0;
    int dupCnt = 0;

    BlockIndex bi = new BlockIndex();
    DoubleMetaphone encoder = new DoubleMetaphone();
    try {
        for (DuDeObjectPair pair : algorithm) {
            if (similarityFunction.getSimilarity(pair) > 0.8) {
                ++dupCnt;
                String pk1 = pair.getFirstElement().getAttributeValue("pk").toString();
                String pk2 = pair.getSecondElement().getAttributeValue("pk").toString();

                String artist1 = pair.getFirstElement().getAttributeValue("artist").toString();
                String artist2 = pair.getSecondElement().getAttributeValue("artist").toString();

                // the phonetic code of the artist value is the block key of each record
                String keyBlockpair1 = encoder.encode(artist1);
                String keyBlockpair2 = encoder.encode(artist2);

                Vertice v1 = new Vertice(pk1, "cd", 0);
                Vertice v2 = new Vertice(pk2, "cd", 0);

                bi.insertVertice(keyBlockpair1, v1);
                bi.insertVertice(keyBlockpair2, v2);
            }
            ++cnt;
        }
    } finally {
        // release the algorithm's resources even when the pair loop fails
        algorithm.cleanUp();
    }

    // print statistics
    System.err.println(
            dupCnt + " duplicates out of " + cnt + " pairs detected in " + (System.currentTimeMillis() - start)
                    + " ms  " + bi.getNumeroElementos() + "  " + bi.getNumeroBlocos());

    QueryExperimento query = new QueryExperimento(bi);
    query.query();

}

From source file:LanguageUsage.java

/**
 * Prints Soundex and Double Metaphone results for a few sample words to standard error.
 *
 * <p>The "Are ... same" lines print the integer returned by
 * {@code StringEncoderComparator.compare}, not a boolean; a value of 0 means the two
 * encoded forms compare as equal.
 *
 * @throws EncoderException declared by the signature
 * @throws DecoderException declared by the signature
 */
public void start() throws EncoderException, DecoderException {

    final String word1 = "Wilson";
    final String word2 = "Wylson";
    final String foreignWord1 = "Otto";
    final String foreignWord2 = "Auto";
    final String surname = "Schmidt";

    Soundex sndx = new Soundex();
    DoubleMetaphone doubleMetaphone = new DoubleMetaphone();

    System.err.println("Soundex Code for " + word1 + " is: " + sndx.encode(word1));
    System.err.println("Soundex Code for " + word2 + " is: " + sndx.encode(word2));

    // Use the StringEncoderComparator to compare these two Strings
    StringEncoderComparator comparator1 = new StringEncoderComparator(sndx);
    System.err.println("Are " + word1 + " and " + word2 + " same based on Soundex? "
            + comparator1.compare(word1, word2));

    System.err.println("Are " + foreignWord2 + " and " + foreignWord1 + " same based on Soundex? "
            + comparator1.compare(foreignWord2, foreignWord1));

    StringEncoderComparator comparator2 = new StringEncoderComparator(doubleMetaphone);

    System.err.println("Are " + foreignWord2 + " and " + foreignWord1 + " same based on DoubleMetaphone? "
            + comparator2.compare(foreignWord2, foreignWord1));

    System.err.println(
            "Double Metaphone primary code for " + surname + ": " + doubleMetaphone.doubleMetaphone(surname));

    // passing true requests the alternate (secondary) encoding
    System.err.println("Double Metaphone secondary code for " + surname + ": "
            + doubleMetaphone.doubleMetaphone(surname, true));

}

From source file:com.vangent.hieos.empi.transform.DoubleMetaphoneTransformFunction.java

/**
 * /*from w  w w .  ja v a 2  s . c o  m*/
 * @param obj
 * @return
 */
public Object transform(Object obj) {
    DoubleMetaphone encoder = new DoubleMetaphone();
    return encoder.encode((String) obj);
}

From source file:DataStructures.SchemaBasedProfiles.AbstractProfile.java

/**
 * Creates a profile for the given entity URL and sets up its Double Metaphone encoder.
 *
 * @param url the value stored as this profile's entity URL
 */
public AbstractProfile(String url) {
    this.entityUrl = url;
    this.doubleMetaphone = new DoubleMetaphone();
}

From source file:dkpro.similarity.algorithms.sound.DoubleMetaphoneComparator.java

/**
 * Creates a comparator whose encoder is a new {@code DoubleMetaphone} instance.
 */
public DoubleMetaphoneComparator() {
    this.encoder = new DoubleMetaphone();
}

From source file:de.tudarmstadt.ukp.dkpro.core.commonscodec.DoubleMetaphonePhoneticTranscriptor.java

public DoubleMetaphonePhoneticTranscriptor() {
    this.encoder = new DoubleMetaphone();
}

From source file:com.kodemore.text.KmTextUtilities.java

/**
 * The metaphone alghorithm./*from www  .  j  ava  2s .c  om*/
 * This improves over the original metaphone.
 * It uses a much more complex ruleset.
 *
 * http://en.wikipedia.org/wiki/Metaphone
 */
public static String doubleMetaphone(String s) {
    return new DoubleMetaphone().encode(s);
}

From source file:ca.sqlpower.matchmaker.munge.DoubleMetaphoneMungeStep.java

/**
 * Writes the Double Metaphone code of the first input's data to this step's output.
 * When the input data is {@code null}, the output data is set to {@code null} as well.
 * The alternate encoding is requested when {@code isUseAlternate()} returns true.
 *
 * @return always {@code true}
 * @throws Exception declared by the signature
 */
public Boolean doCall() throws Exception {
    final MungeStepOutput<String> output = getOut();
    final MungeStepOutput<String> input = getMSOInputs().get(0);
    final String value = input.getData();

    // nothing to encode: pass the null through to the output
    if (value == null) {
        output.setData(null);
        return true;
    }

    output.setData(new DoubleMetaphone().doubleMetaphone(value, isUseAlternate()));
    return true;
}

From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java

/**
 * Computes attribute proposals where the class/definition name must match exactly, but where
 * parameters are processed with fuzzy logic.
 *
 * <p>All candidates produced by {@code generateAttributeCandidates} are returned; the
 * Double Metaphone code of the last segment of {@code currentName} is only used to sort them.
 *
 * @param currentName
 *            the qualified name being completed; names with fewer than two segments
 *            yield an empty result
 * @param descs
 *            passed through to {@code generateAttributeCandidates}
 * @param searchPath
 *            passed through to {@code generateAttributeCandidates}
 * @return the candidates sorted by a {@code PronunciationComparator}, or an empty array
 */
public String[] computeAttributeProposals(final QualifiedName currentName,
        Collection<IEObjectDescription> descs, PPSearchPath searchPath) {
    if (currentName.getSegmentCount() < 2) {
        return new String[0];
    }

    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String metaphoneName = encoder.encode(currentName.getLastSegment());

    // propose every candidate; likeness only decides the order
    final Collection<String> candidates = generateAttributeCandidates(currentName, descs, searchPath);
    final String[] sorted = candidates.toArray(new String[candidates.size()]);
    Arrays.sort(sorted, new PronunciationComparator(encoder, metaphoneName));
    return sorted;
}

From source file:com.panet.imeta.core.row.ValueDataUtil.java

/**
 * Returns the Double Metaphone code of the string form of a value.
 *
 * @param metaA value metadata; not used by this method
 * @param dataA the value to encode, converted with {@code toString()}; may be {@code null}
 * @return the Double Metaphone code, or {@code null} when {@code dataA} is {@code null}
 */
public static String get_Double_Metaphone(ValueMetaInterface metaA, Object dataA) {
    if (dataA != null) {
        final DoubleMetaphone encoder = new DoubleMetaphone();
        return encoder.doubleMetaphone(dataA.toString());
    }
    return null;
}