List of usage examples for the org.apache.commons.codec.language.Caverphone2 constructor Caverphone2()
Caverphone2
From source file:com.vangent.hieos.empi.transform.Caverphone2TransformFunction.java
/** * /*from w w w . j a v a 2 s .c o m*/ * @param obj * @return */ public Object transform(Object obj) { Caverphone2 encoder = new Caverphone2(); return encoder.encode((String) obj); }
From source file:com.example.PhoneticTokenFilterFactory.java
@Inject public PhoneticTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) {//from ww w.jav a 2s. co m super(index, indexSettingsService.getSettings(), name, settings); this.languageset = null; this.nametype = null; this.ruletype = null; this.maxcodelength = 0; this.replace = settings.getAsBoolean("replace", true); // weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default String encodername = settings.get("encoder", "metaphone"); if ("metaphone".equalsIgnoreCase(encodername)) { this.encoder = new Metaphone(); } else if ("soundex".equalsIgnoreCase(encodername)) { this.encoder = new Soundex(); } else if ("caverphone1".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone1(); } else if ("caverphone2".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone2(); } else if ("caverphone".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone2(); } else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) { this.encoder = new RefinedSoundex(); } else if ("cologne".equalsIgnoreCase(encodername)) { this.encoder = new ColognePhonetic(); } else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) { this.encoder = null; this.maxcodelength = settings.getAsInt("max_code_len", 4); } else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) { this.encoder = null; this.languageset = settings.getAsArray("languageset"); String ruleType = settings.get("rule_type", "approx"); if ("approx".equalsIgnoreCase(ruleType)) { ruletype = RuleType.APPROX; } else if ("exact".equalsIgnoreCase(ruleType)) { ruletype = RuleType.EXACT; } else { throw new IllegalArgumentException( "No matching rule type [" + ruleType + "] for beider morse encoder"); } String nameType = 
settings.get("name_type", "generic"); if ("GENERIC".equalsIgnoreCase(nameType)) { nametype = NameType.GENERIC; } else if ("ASHKENAZI".equalsIgnoreCase(nameType)) { nametype = NameType.ASHKENAZI; } else if ("SEPHARDIC".equalsIgnoreCase(nameType)) { nametype = NameType.SEPHARDIC; } } else if ("koelnerphonetik".equalsIgnoreCase(encodername)) { this.encoder = new KoelnerPhonetik(); } else if ("haasephonetik".equalsIgnoreCase(encodername)) { this.encoder = new HaasePhonetik(); } else if ("nysiis".equalsIgnoreCase(encodername)) { this.encoder = new Nysiis(); } else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) { this.encoder = new DaitchMokotoffSoundex(); } else { throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter"); } }
From source file:com.jaeksoft.searchlib.analysis.filter.PhoneticFilter.java
@Override public TokenStream create(TokenStream tokenStream) { if (BEIDER_MORSE.equals(codec)) return new BeiderMorseTokenFilter(tokenStream, new EncoderKey(ruleType, maxPhonemes)); if (COLOGNE_PHONETIC.equals(codec)) return new EncoderTokenFilter(tokenStream, new ColognePhonetic()); if (SOUNDEX.equals(codec)) return new EncoderTokenFilter(tokenStream, new Soundex()); if (REFINED_SOUNDEX.equals(codec)) return new EncoderTokenFilter(tokenStream, new RefinedSoundex()); if (METAPHONE.equals(codec)) return new EncoderTokenFilter(tokenStream, new Metaphone()); if (CAVERPHONE1.equals(codec)) return new EncoderTokenFilter(tokenStream, new Caverphone1()); if (CAVERPHONE2.equals(codec)) return new EncoderTokenFilter(tokenStream, new Caverphone2()); return null;// ww w . j a v a 2 s. c o m }
From source file:org.apache.lucene.analysis.phonetic.TestPhoneticFilter.java
/**
 * Runs {@code assertAlgorithm} for each encoder twice, once with the boolean flag set and
 * once with it cleared, against fixed inputs and the expected output terms.
 */
public void testAlgorithms() throws Exception {
    // Shared inputs: one synthetic sentence, and one list of names for Caverphone2.
    final String text = "aaa bbb ccc easgasg";
    final String names = "Darda Karleen Datha Carlene";

    assertAlgorithm(new Metaphone(), true, text,
            new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
    assertAlgorithm(new Metaphone(), false, text,
            new String[] { "A", "B", "KKK", "ESKS" });

    assertAlgorithm(new DoubleMetaphone(), true, text,
            new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
    assertAlgorithm(new DoubleMetaphone(), false, text,
            new String[] { "A", "PP", "KK", "ASKS" });

    assertAlgorithm(new Soundex(), true, text,
            new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
    assertAlgorithm(new Soundex(), false, text,
            new String[] { "A000", "B000", "C000", "E220" });

    assertAlgorithm(new RefinedSoundex(), true, text,
            new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
    assertAlgorithm(new RefinedSoundex(), false, text,
            new String[] { "A0", "B1", "C3", "E034034" });

    assertAlgorithm(new Caverphone2(), true, names,
            new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen",
                    "TTA1111111", "Datha", "KLN1111111", "Carlene" });
    assertAlgorithm(new Caverphone2(), false, names,
            new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });

    assertAlgorithm(new Nysiis(), true, text,
            new String[] { "A", "aaa", "B", "bbb", "C", "ccc", "EASGAS", "easgasg" });
    assertAlgorithm(new Nysiis(), false, text,
            new String[] { "A", "B", "C", "EASGAS" });
}
From source file:org.apache.lucene.analysis.phonetic.TestPhoneticFilter.java
/**
 * Blast some random strings through the analyzer, once with the filter's inject flag
 * set and once with it cleared, for every encoder.
 *
 * @throws IOException if the random-data check fails on I/O
 */
public void testRandomStrings() throws IOException {
    Encoder[] encoders = new Encoder[] { new Metaphone(), new DoubleMetaphone(), new Soundex(),
            new RefinedSoundex(), new Caverphone2() };
    for (final Encoder e : encoders) {
        // Previously both analyzers passed inject=false, so the second run repeated the
        // first and the inject=true path was never exercised with random data.
        for (final boolean inject : new boolean[] { true, false }) {
            Analyzer a = new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
                    return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, inject));
                }
            };
            checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
            a.close();
        }
    }
}
From source file:org.apache.lucene.analysis.phonetic.TestPhoneticFilter.java
public void testEmptyTerm() throws IOException { Encoder encoders[] = new Encoder[] { new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone2() }; for (final Encoder e : encoders) { Analyzer a = new Analyzer() { @Override//from ww w . j av a 2 s . c o m protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, random().nextBoolean())); } }; checkOneTerm(a, "", ""); a.close(); } }
From source file:org.elasticsearch.index.analysis.phonetic.PhoneticTokenFilterFactory.java
@Inject public PhoneticTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {/* ww w . j a va 2 s .c o m*/ super(index, indexSettings, name, settings); this.replace = settings.getAsBoolean("replace", true); String encoder = settings.get("encoder"); if (encoder == null) { throw new ElasticSearchIllegalArgumentException("encoder must be set on phonetic token filter"); } if ("metaphone".equalsIgnoreCase(encoder)) { this.encoder = new Metaphone(); } else if ("soundex".equalsIgnoreCase(encoder)) { this.encoder = new Soundex(); } else if ("caverphone1".equalsIgnoreCase(encoder)) { this.encoder = new Caverphone1(); } else if ("caverphone2".equalsIgnoreCase(encoder)) { this.encoder = new Caverphone2(); } else if ("caverphone".equalsIgnoreCase(encoder)) { this.encoder = new Caverphone2(); } else if ("refined_soundex".equalsIgnoreCase(encoder) || "refinedSoundex".equalsIgnoreCase(encoder)) { this.encoder = new RefinedSoundex(); } else if ("cologne".equalsIgnoreCase(encoder)) { this.encoder = new ColognePhonetic(); } else if ("double_metaphone".equalsIgnoreCase(encoder) || "doubleMetaphone".equalsIgnoreCase(encoder)) { DoubleMetaphone doubleMetaphone = new DoubleMetaphone(); doubleMetaphone.setMaxCodeLen(settings.getAsInt("max_code_len", doubleMetaphone.getMaxCodeLen())); this.encoder = doubleMetaphone; } else { throw new ElasticSearchIllegalArgumentException( "unknown encoder [" + encoder + "] for phonetic token filter"); } }
From source file:org.elasticsearch.index.analysis.PhoneticTokenFilterFactory.java
public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {/*from w w w . ja va 2s .c om*/ super(indexSettings, name, settings); this.languageset = null; this.nametype = null; this.ruletype = null; this.maxcodelength = 0; this.replace = settings.getAsBoolean("replace", true); // weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default String encodername = settings.get("encoder", "metaphone"); if ("metaphone".equalsIgnoreCase(encodername)) { this.encoder = new Metaphone(); } else if ("soundex".equalsIgnoreCase(encodername)) { this.encoder = new Soundex(); } else if ("caverphone1".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone1(); } else if ("caverphone2".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone2(); } else if ("caverphone".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone2(); } else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) { this.encoder = new RefinedSoundex(); } else if ("cologne".equalsIgnoreCase(encodername)) { this.encoder = new ColognePhonetic(); } else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) { this.encoder = null; this.maxcodelength = settings.getAsInt("max_code_len", 4); } else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) { this.encoder = null; this.languageset = settings.getAsList("languageset"); String ruleType = settings.get("rule_type", "approx"); if ("approx".equalsIgnoreCase(ruleType)) { ruletype = RuleType.APPROX; } else if ("exact".equalsIgnoreCase(ruleType)) { ruletype = RuleType.EXACT; } else { throw new IllegalArgumentException( "No matching rule type [" + ruleType + "] for beider morse encoder"); } String nameType = settings.get("name_type", "generic"); if 
("GENERIC".equalsIgnoreCase(nameType)) { nametype = NameType.GENERIC; } else if ("ASHKENAZI".equalsIgnoreCase(nameType)) { nametype = NameType.ASHKENAZI; } else if ("SEPHARDIC".equalsIgnoreCase(nameType)) { nametype = NameType.SEPHARDIC; } } else if ("koelnerphonetik".equalsIgnoreCase(encodername)) { this.encoder = new KoelnerPhonetik(); } else if ("haasephonetik".equalsIgnoreCase(encodername)) { this.encoder = new HaasePhonetik(); } else if ("nysiis".equalsIgnoreCase(encodername)) { this.encoder = new Nysiis(); } else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) { this.encoder = new DaitchMokotoffSoundex(); } else { throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter"); } }
From source file:org.mitre.opensextant.phonetic.Phoneticizer.java
public Phoneticizer() { // populate the algorithms Map with an instance of each encoder // first the ones from Apache Commons BeiderMorseEncoder bmExact = new BeiderMorseEncoder(); bmExact.setRuleType(RuleType.EXACT); bmExact.setConcat(false);/*w w w . j ava2s . com*/ BeiderMorseEncoder bmApprox = new BeiderMorseEncoder(); bmApprox.setRuleType(RuleType.APPROX); bmApprox.setConcat(false); // StringEncoder caver = new Caverphone(); StringEncoder caver1 = new Caverphone1(); StringEncoder caver2 = new Caverphone2(); StringEncoder colgne = new ColognePhonetic(); DoubleMetaphone doubleMeta = new DoubleMetaphone(); doubleMeta.setMaxCodeLen(10); StringEncoder meta = new Metaphone(); StringEncoder refinedSound = new RefinedSoundex(); StringEncoder sound = new Soundex(); // now, the home-brewed ones StringEncoder noop = new NullEncoder(); StringEncoder caser = new CaseEncoder(); StringEncoder diaRemover = new DiacriticEncoder(); StringEncoder punctRemover = new PunctEncoder(); StringEncoder simple0 = new SimplePhonetic0Encoder(); StringEncoder simple1 = new SimplePhonetic1Encoder(); StringEncoder simple2 = new SimplePhonetic2Encoder(); // not really language encodings // StringEncoder qcode = new QCodec(); // StringEncoder qpcode = new QuotedPrintableCodec(); // StringEncoder urlcode = new URLCodec(); algorithms.put("Beider-Morse-Exact", bmExact); algorithms.put("Beider-Morse-Approximate", bmApprox); // algorithms.put("CaverPhone", caver); algorithms.put("CaverPhone_1.0", caver1); algorithms.put("CaverPhone_2.0", caver2); algorithms.put("Cologne_Phonetic", colgne); algorithms.put("Double_Metaphone", doubleMeta); algorithms.put("Metaphone", meta); algorithms.put("Refined_Soundex", refinedSound); algorithms.put("Soundex", sound); algorithms.put("Nothing", noop); algorithms.put("Case_Insensitive", caser); algorithms.put("Diacritic_Insensitive", diaRemover); algorithms.put("Puncuation_Insensitive", punctRemover); algorithms.put("Simple_Phonetic0", simple0); 
algorithms.put("Simple_Phonetic1", simple1); algorithms.put("Simple_Phonetic2", simple2); // not really language encodings // algorithms.put("Q Code", qcode); // algorithms.put("Q Printable", qpcode); // algorithms.put("URL Code", urlcode); }
From source file:org.opensextant.phonetic.Phoneticizer.java
public Phoneticizer() { // populate the algorithms Map with an instance of each encoder // first the ones from Apache Commons BeiderMorseEncoder bmExact = new BeiderMorseEncoder(); bmExact.setRuleType(RuleType.EXACT); bmExact.setConcat(false);//from www .ja v a2 s .c o m BeiderMorseEncoder bmApprox = new BeiderMorseEncoder(); bmApprox.setRuleType(RuleType.APPROX); bmApprox.setConcat(false); // StringEncoder caver = new Caverphone(); StringEncoder caver1 = new Caverphone1(); StringEncoder caver2 = new Caverphone2(); StringEncoder colgne = new ColognePhonetic(); DoubleMetaphone doubleMeta = new DoubleMetaphone(); doubleMeta.setMaxCodeLen(10); StringEncoder meta = new Metaphone(); StringEncoder refinedSound = new RefinedSoundex(); StringEncoder sound = new Soundex(); // now, the home-brewed ones StringEncoder noop = new NullEncoder(); StringEncoder caser = new CaseEncoder(); StringEncoder diaRemover = new DiacriticEncoder(); StringEncoder punctRemover = new PunctEncoder(); StringEncoder simple0 = new SimplePhonetic0Encoder(); StringEncoder simple0Solr = new SimplePhonetic0SolrEncoder(); StringEncoder simple0SolrPlus = new SimplePhonetic0SolrPlusEncoder(); StringEncoder simple1 = new SimplePhonetic1Encoder(); StringEncoder simple2 = new SimplePhonetic2Encoder(); // not really language encodings // StringEncoder qcode = new QCodec(); // StringEncoder qpcode = new QuotedPrintableCodec(); // StringEncoder urlcode = new URLCodec(); algorithms.put("Beider-Morse-Exact", bmExact); algorithms.put("Beider-Morse-Approximate", bmApprox); // algorithms.put("CaverPhone", caver); algorithms.put("CaverPhone_1.0", caver1); algorithms.put("CaverPhone_2.0", caver2); algorithms.put("Cologne_Phonetic", colgne); algorithms.put("Double_Metaphone", doubleMeta); algorithms.put("Metaphone", meta); algorithms.put("Refined_Soundex", refinedSound); algorithms.put("Soundex", sound); algorithms.put("Nothing", noop); algorithms.put("Case_Insensitive", caser); algorithms.put("Diacritic_Insensitive", 
diaRemover); algorithms.put("Puncuation_Insensitive", punctRemover); algorithms.put("Simple_Phonetic0", simple0); algorithms.put("Simple_Phonetic0Solr", simple0Solr); algorithms.put("Simple_Phonetic0SolrPlus", simple0SolrPlus); algorithms.put("Simple_Phonetic1", simple1); algorithms.put("Simple_Phonetic2", simple2); // not really language encodings // algorithms.put("Q Code", qcode); // algorithms.put("Q Printable", qpcode); // algorithms.put("URL Code", urlcode); }