org.sifarish.etl.CountryStandardFormat.java Source code

Java tutorial

Introduction

Here is the source code for org.sifarish.etl.CountryStandardFormat.java

Source

/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.etl;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.sifarish.feature.DynamicAttrSimilarityStrategy;

/**
 * Country specific formats for different kinds of structured text data.
 * <p>
 * Concrete subclasses supply the country dependent pieces (state codes, phone
 * numbers, street addresses). This base class implements the country neutral
 * pieces: person names, email addresses, punctuation removal and a fuzzy
 * matching helper for address components.
 *
 * @author pranab
 */
public abstract class CountryStandardFormat {
    /** State code lookup, expected to be filled by {@link #intializeStateCodes()}. */
    protected Map<String, String> stateCodes = new HashMap<String, String>();

    // Person name layouts. Every pattern is applied with Matcher.matches(), i.e. it has
    // to cover the whole input, so the layouts cannot shadow each other.
    /** first, middle and last name, all written out */
    private static final Pattern FULL_NAME_PATTERN =
            Pattern.compile("(\\w{2,})\\s+(\\w{2,})\\s+(\\w{2,})");
    /** first and last name */
    private static final Pattern FIRST_LAST_PATTERN =
            Pattern.compile("(\\w{2,})\\s+(\\w{2,})");
    /** first name, middle initial, last name */
    private static final Pattern FIRST_MID_INITIAL_LAST_PATTERN =
            Pattern.compile("(\\w{2,})\\s+(\\w{1})\\s+(\\w{2,})");
    /** first initial, middle initial, last name */
    private static final Pattern INITIALS_LAST_PATTERN =
            Pattern.compile("(\\w{1})\\s+(\\w{1})\\s+(\\w{2,})");
    /** last name, comma, first initial */
    private static final Pattern LAST_FIRST_INITIAL_PATTERN =
            Pattern.compile("(\\w{2,}),\\s+(\\w{1})");
    /** last name, comma, first name */
    private static final Pattern LAST_FIRST_PATTERN =
            Pattern.compile("(\\w{2,}),\\s+(\\w{2,})");
    /** last name, comma, first initial, middle initial */
    private static final Pattern LAST_INITIALS_PATTERN =
            Pattern.compile("(\\w{2,}),\\s+(\\w{1})\\s+(\\w{1})");
    /** last name, comma, first name, middle name */
    private static final Pattern LAST_FIRST_MID_PATTERN =
            Pattern.compile("(\\w{2,}),\\s+(\\w{2,})\\s+(\\w{2,})");

    /**
     * Creates the formatter for a country. Only {@code "USA"} is supported.
     *
     * @param country country name
     * @param textNormalizer normalizer handed to the country specific formatter
     * @return formatter for the country
     * @throws IllegalArgumentException if the country is null or not supported
     */
    public static CountryStandardFormat createCountryStandardFormat(String country,
            StructuredTextNormalizer textNormalizer) {
        // constant first, so that a null country is reported as an invalid argument
        // instead of failing with a NullPointerException
        if ("USA".equals(country)) {
            return new UnitedStatesStandardFormat(textNormalizer);
        }
        throw new IllegalArgumentException("invalid country name");
    }

    /**
     * Triggers {@link #intializeStateCodes()}.
     * <p>
     * Note that the subclass implementation runs before the subclass's own field
     * initializers and constructor body, so it must only rely on state declared
     * in this class (such as {@link #stateCodes}).
     */
    public CountryStandardFormat() {
        super();
        intializeStateCodes();
    }

    /**
      * initializes state codes; invoked from the constructor of this class
      */
    public abstract void intializeStateCodes();

    /**
     * case based formatting
     * @param item text to format
     * @param format name of the case format
     * @return formatted text
     */
    public abstract String caseFormat(String item, String format);

    /**
     * phone number formatting
     * @param item phone number text
     * @param format name of the phone number format
     * @return formatted phone number
     */
    public abstract String phoneNumFormat(String item, String format);

    /**
     * state name formatting
     * @param item state text
     * @return formatted state
     * @throws IOException
     */
    public abstract String stateFormat(String item) throws IOException;

    /**
     * state name formatting with optional fuzzy matching
     * @param item state text
     * @param fuzzyMatch whether fuzzy matching should be attempted
     * @param textSimStrategy text similarity strategy used for fuzzy matching
     * @param minDist distance threshold for fuzzy matching
     * @return formatted state
     * @throws IOException
     */
    public abstract String stateFormat(String item, boolean fuzzyMatch,
            DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException;

    /**
     * Rewrites a person name as {@code First Middle Last}.
     * <p>
     * The input has to match one of the supported layouts as a whole: space
     * separated first / middle / last (middle optional, first and middle may be
     * single letter initials) or {@code Last, First [Middle]}. Full names are
     * capitalized after lower casing, initials are upper cased.
     *
     * @param item person name
     * @return normalized name, or an empty string when no layout matches
     */
    public String personNameFormat(String item) {
        String firstFull = null;
        String firstInitial = null;
        String middleFull = null;
        String middleInitial = null;
        String last = null;

        // the first layout that matches the whole input wins
        Matcher matcher;
        if ((matcher = FULL_NAME_PATTERN.matcher(item)).matches()) {
            //first full, mid full, last
            firstFull = matcher.group(1);
            middleFull = matcher.group(2);
            last = matcher.group(3);
        } else if ((matcher = FIRST_LAST_PATTERN.matcher(item)).matches()) {
            //first full, last
            firstFull = matcher.group(1);
            last = matcher.group(2);
        } else if ((matcher = FIRST_MID_INITIAL_LAST_PATTERN.matcher(item)).matches()) {
            //first full, middle initial, last
            firstFull = matcher.group(1);
            middleInitial = matcher.group(2);
            last = matcher.group(3);
        } else if ((matcher = INITIALS_LAST_PATTERN.matcher(item)).matches()) {
            //first initial, middle initial, last
            firstInitial = matcher.group(1);
            middleInitial = matcher.group(2);
            last = matcher.group(3);
        } else if ((matcher = LAST_FIRST_INITIAL_PATTERN.matcher(item)).matches()) {
            //last, first initial
            last = matcher.group(1);
            firstInitial = matcher.group(2);
        } else if ((matcher = LAST_FIRST_PATTERN.matcher(item)).matches()) {
            //last, first
            last = matcher.group(1);
            firstFull = matcher.group(2);
        } else if ((matcher = LAST_INITIALS_PATTERN.matcher(item)).matches()) {
            //last, first initial, mid initial
            last = matcher.group(1);
            firstInitial = matcher.group(2);
            middleInitial = matcher.group(3);
        } else if ((matcher = LAST_FIRST_MID_PATTERN.matcher(item)).matches()) {
            //last, first, mid
            last = matcher.group(1);
            firstFull = matcher.group(2);
            middleFull = matcher.group(3);
        }

        // assemble in first, middle, last order; a full name takes precedence over an initial
        StringBuilder stBld = new StringBuilder();
        if (null != firstFull) {
            stBld.append(StringUtils.capitalize(firstFull.toLowerCase())).append(" ");
        } else if (null != firstInitial) {
            stBld.append(StringUtils.upperCase(firstInitial)).append(" ");
        }

        if (null != middleFull) {
            stBld.append(StringUtils.capitalize(middleFull.toLowerCase())).append(" ");
        } else if (null != middleInitial) {
            stBld.append(StringUtils.upperCase(middleInitial)).append(" ");
        }

        if (null != last) {
            stBld.append(StringUtils.capitalize(last.toLowerCase()));
        }

        return stBld.toString();
    }

    /**
     * street address formatting
     * @param item street address text
     * @return formatted street address
     * @throws IOException
     */
    public abstract String streetAddressFormat(String item) throws IOException;

    /**
     * address formatting
     * @param item address text
     * @return formatted address
     * @throws IOException
     */
    public abstract String addressFormat(String item) throws IOException;

    /**
     * street address one formatting
     * @param item street address one text
     * @return formatted street address one
     * @throws IOException
     */
    public abstract String streetAddressOneFormat(String item) throws IOException;

    /**
     * street address one formatting with optional fuzzy matching
     * @param item street address one text
     * @param fuzzyMatch whether fuzzy matching should be attempted
     * @param textSimStrategy text similarity strategy used for fuzzy matching
     * @param minDist distance threshold for fuzzy matching
     * @return formatted street address one
     * @throws IOException
     */
    public abstract String streetAddressOneFormat(String item, boolean fuzzyMatch,
            DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException;

    /**
     * street address two formatting
     * @param item street address two text
     * @return formatted street address two
     * @throws IOException
     */
    public abstract String streetAddressTwoFormat(String item) throws IOException;

    /**
     * street address two formatting with optional fuzzy matching
     * @param item street address two text
     * @param fuzzyMatch whether fuzzy matching should be attempted
     * @param textSimStrategy text similarity strategy used for fuzzy matching
     * @param minDist distance threshold for fuzzy matching
     * @return formatted street address two
     * @throws IOException
     */
    public abstract String streetAddressTwoFormat(String item, boolean fuzzyMatch,
            DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException;

    /**
     * Applies a case format to the part of an email address in front of the
     * last {@code @}; the part after it is returned unchanged.
     *
     * @param item email address
     * @param format one of {@code lowerCase}, {@code upperCase}, {@code capitalize}
     * @return email address with the re-cased name part
     * @throws IllegalArgumentException if the address contains no {@code @} or
     *         the format is not one of the supported values
     */
    public String emailFormat(String item, String format) {
        // split on the last '@' so that nothing is dropped when the name part itself
        // contains one, and fail with a descriptive error when there is none
        int atPos = item.lastIndexOf('@');
        if (atPos < 0) {
            throw new IllegalArgumentException("invalid email address, missing @");
        }
        String name = item.substring(0, atPos);
        String domain = item.substring(atPos + 1);

        if (format.equals("lowerCase")) {
            name = name.toLowerCase();
        } else if (format.equals("upperCase")) {
            name = name.toUpperCase();
        } else if (format.equals("capitalize")) {
            name = StringUtils.capitalize(name.toLowerCase());
        } else {
            throw new IllegalArgumentException("invalid case format");
        }

        return name + "@" + domain;
    }

    /**
     * Removes every period and comma.
     *
     * @param item text to clean
     * @return text without periods and commas
     */
    public String removePunctuations(String item) {
        // chained on the intermediate result, so that both characters are removed
        return item.replace(".", "").replace(",", "");
    }

    /**
     * Tries to map a component that the token normalizer does not know to a
     * known one by fuzzy matching.
     * <p>
     * Nothing is done when {@code containsNormalize} reports the component as
     * known. Otherwise the unnormalized forms are tried first and the match is
     * run through {@code normalize}; if that match is not within the threshold
     * the normalized forms are tried and the match is used directly. A match is
     * accepted when its right element is less than or equal to {@code minDist}.
     *
     * @param component component text
     * @param tokenNormalizer token normalizer providing lookup and fuzzy matching
     * @param textSimStrategy text similarity strategy passed to the normalizer
     * @param minDist largest accepted value of the right element of a match
     * @return pair of a flag telling whether a fuzzy match was accepted and the
     *         resulting component (the input component when nothing was accepted)
     * @throws IOException
     */
    protected Pair<Boolean, String> fuzyyMatchComponent(String component, TextFieldTokenNormalizer tokenNormalizer,
            DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException {
        if (tokenNormalizer.containsNormalize(component)) {
            // already known, no fuzzy matching needed
            return new ImmutablePair<Boolean, String>(false, component);
        }

        String newComponent = component;
        boolean fuzzyMatched = false;

        //try fuzzy matching with unnormalized forms first
        Pair<String, Double> match = tokenNormalizer.fuzzymatchWithUnnormalized(component, textSimStrategy);
        if (match.getRight() <= minDist) {
            newComponent = tokenNormalizer.normalize(match.getLeft());
            fuzzyMatched = true;
        } else {
            //fall back to normalized forms
            match = tokenNormalizer.fuzzymatchWithNormalized(component, textSimStrategy);
            if (match.getRight() <= minDist) {
                newComponent = match.getLeft();
                fuzzyMatched = true;
            }
        }
        return new ImmutablePair<Boolean, String>(fuzzyMatched, newComponent);
    }

}