// Java tutorial
/*
 * Phon - An open source tool for research in phonology.
 * Copyright (C) 2005 - 2015, Gregory Hedlund <ghedlund@mun.ca> and Yvan Rose <yrose@mun.ca>
 * Dept of Linguistics, Memorial University <https://phon.ca>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package ca.phon.ipadictionary;

import java.io.Serializable;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import ca.phon.ipa.IPATranscript;

/**
 * Rule for handling contractions.
 *
 * Rules are in format
 *
 * A -> B
 *
 * Where A is a set of conditions for matching
 * this rule and B is a list of instructions for
 * building the transcription.
 *
 * A conditions can be matched for the left-hand
 * contraction, transcript of left-hand contraction
 * and same on right side. Conditions can be specified
 * as plain text, regular expressions or phonex expressions.
 *
 * B instructions can include any of the input values
 * or literal values.
 *
 * In the text form read by {@link #parseContractionRule(String)} a rule
 * is a sequence of <code>key:"value"</code> entries. Recognized keys:
 * <ul>
 * <li><code>lhs</code>, <code>lhs.regex</code>, <code>lhs.phonex</code> -
 *     condition on the left-hand orthography</li>
 * <li><code>lhs.ipa</code>, <code>lhs.ipa.regex</code>, <code>lhs.ipa.phonex</code> -
 *     condition on the left-hand IPA transcript</li>
 * <li><code>rhs...</code> and <code>rhs.ipa...</code> - same for the right-hand side</li>
 * <li><code>expr</code> - the build expression (see {@link #setVExpr(String)})</li>
 * <li><code>split</code> - recognized but currently ignored</li>
 * </ul>
 * A key without a <code>.regex</code>/<code>.phonex</code> suffix is a plain
 * (exact) comparison.
 */
public final class ContractionRule implements Serializable {

	private static final long serialVersionUID = 3962967640160708840L;

	private static final Logger LOGGER = Logger.getLogger(ContractionRule.class.getName());

	/**
	 * One <code>key:"value"</code> entry of a rule. Keys consist of letters
	 * and dots; values are non-empty and cannot contain a double quote.
	 */
	private static final Pattern RULE_ENTRY =
			Pattern.compile("([a-zA-Z.]+):\\\"([^\"]+)\\\"");

	/** How a condition expression is compared against its input value. */
	public enum ConditionType {
		PLAIN,
		REGEX,
		PHONEX;
	}

	/**
	 * Parse a rule from its text form.
	 *
	 * Every <code>key:"value"</code> entry found in the text is applied to a
	 * new rule; text between entries and entries with unknown keys are
	 * skipped. Conditions which are not mentioned keep their default
	 * (regex '.*', i.e., match anything).
	 *
	 * @param text rule text
	 * @return the parsed rule, never <code>null</code>
	 * @throws NullPointerException if text is <code>null</code>
	 */
	public static ContractionRule parseContractionRule(String text) {
		final ContractionRule cr = new ContractionRule();

		final Matcher matcher = RULE_ENTRY.matcher(text);
		while (matcher.find()) {
			final String key = matcher.group(1);
			final String val = matcher.group(2);

			// the '.ipa' variants are tested before the bare prefixes since
			// every 'lhs.ipa...' key also starts with 'lhs' (same for rhs)
			if (key.equalsIgnoreCase("split")) {
				// TODO
			} else if (key.startsWith("lhs.ipa")) {
				cr.setTlhsType(conditionTypeFor(key));
				cr.setTlhsExpr(val);
			} else if (key.startsWith("lhs")) {
				cr.setLhsType(conditionTypeFor(key));
				cr.setLhsExpr(val);
			} else if (key.startsWith("rhs.ipa")) {
				cr.setTrhsType(conditionTypeFor(key));
				cr.setTrhsExpr(val);
			} else if (key.startsWith("rhs")) {
				cr.setRhsType(conditionTypeFor(key));
				cr.setRhsExpr(val);
			} else if (key.startsWith("expr")) {
				cr.setVExpr(val);
			}
		}

		return cr;
	}

	/**
	 * Determine the condition type from the suffix of a rule key.
	 *
	 * @param key rule key
	 * @return REGEX for '.regex', PHONEX for '.phonex', PLAIN otherwise
	 */
	private static ConditionType conditionTypeFor(String key) {
		if (key.endsWith(".regex")) {
			return ConditionType.REGEX;
		} else if (key.endsWith(".phonex")) {
			return ConditionType.PHONEX;
		}
		return ConditionType.PLAIN;
	}

	/*
	 * By default all expressions are regex with '.*' as the value.
	 * (i.e., match anything)
	 */
	private String lhsExpr = ".*";
	private ConditionType lhsType = ConditionType.REGEX;

	private String rhsExpr = ".*";
	private ConditionType rhsType = ConditionType.REGEX;

	private String tlhsExpr = ".*";
	private ConditionType tlhsType = ConditionType.REGEX;

	private String trhsExpr = ".*";
	private ConditionType trhsType = ConditionType.REGEX;

	/**
	 * V expression
	 *
	 * The expression is split on '+' and each part is stripped of
	 * surrounding whitespace. There are 4 variable fields allowed
	 * in the 'V' string:
	 *
	 * 1) ${lhs}
	 *   Left-hand side ortho value
	 * 2) ${rhs}
	 *   Right-hand side ortho value
	 * 3) ${lhs.ipa}
	 *   Left-hand side IPA transcript
	 * 4) ${rhs.ipa}
	 *   Right-hand side IPA transcript
	 *
	 * Any other part is copied to the output as a literal value.
	 * Because '+' is the separator, a literal cannot contain '+'.
	 */
	private String vExpr;

	/**
	 * Parsed form of {@link #vExpr}. Transient, so it is <code>null</code>
	 * after deserialization and is rebuilt on demand by {@link #clauses()}.
	 */
	private transient List<VClause> tBuilder = new ArrayList<VClause>();

	/** One part of the V expression: its kind and its stripped text. */
	private static final class VClause {
		ValueClause type;
		String value;

		VClause(ValueClause type, String value) {
			this.type = type;
			this.value = value;
		}
	}

	/** Kinds of V expression parts. */
	private enum ValueClause {
		LHS("${lhs}"),
		RHS("${rhs}"),
		T_LHS("${lhs.ipa}"),
		T_RHS("${rhs.ipa}"),
		LITERAL(null);

		/** Exact placeholder text for this kind; <code>null</code> for literals. */
		private final String token;

		private ValueClause(String token) {
			this.token = token;
		}

		/**
		 * Classify one (already stripped) part of a V expression.
		 *
		 * @param str expression part
		 * @return the placeholder kind whose token equals str exactly,
		 *  LITERAL otherwise
		 */
		public static ValueClause getClause(String str) {
			for (ValueClause clause : values()) {
				if (clause.token != null && clause.token.equals(str)) {
					return clause;
				}
			}
			return LITERAL;
		}
	}

	public ContractionRule() {
		super();
	}

	public String getLhsExpr() {
		return lhsExpr;
	}

	public void setLhsExpr(String lhsExpr) {
		this.lhsExpr = lhsExpr;
	}

	public ConditionType getLhsType() {
		return lhsType;
	}

	public void setLhsType(ConditionType lhsType) {
		this.lhsType = lhsType;
	}

	public String getRhsExpr() {
		return rhsExpr;
	}

	public void setRhsExpr(String rhsExpr) {
		this.rhsExpr = rhsExpr;
	}

	public ConditionType getRhsType() {
		return rhsType;
	}

	public void setRhsType(ConditionType rhsType) {
		this.rhsType = rhsType;
	}

	public String getTlhsExpr() {
		return tlhsExpr;
	}

	public void setTlhsExpr(String tlhsExpr) {
		this.tlhsExpr = tlhsExpr;
	}

	public ConditionType getTlhsType() {
		return tlhsType;
	}

	public void setTlhsType(ConditionType tlhsType) {
		this.tlhsType = tlhsType;
	}

	public String getTrhsExpr() {
		return trhsExpr;
	}

	public void setTrhsExpr(String trhsExpr) {
		this.trhsExpr = trhsExpr;
	}

	public ConditionType getTrhsType() {
		return trhsType;
	}

	public void setTrhsType(ConditionType trhsType) {
		this.trhsType = trhsType;
	}

	/**
	 * @return the parsed clauses of the V expression; rebuilt from the
	 *  expression when no list is currently available
	 */
	public List<VClause> getTBuilder() {
		return clauses();
	}

	public void setTBuilder(List<VClause> builder) {
		tBuilder = builder;
	}

	public String getVExpr() {
		return vExpr;
	}

	/**
	 * Set the V expression and re-parse it into clauses.
	 *
	 * @param expr the new expression; <code>null</code> results in
	 *  an empty clause list
	 */
	public void setVExpr(String expr) {
		vExpr = expr;
		parseVExpr();
	}

	/**
	 * Clause list for the current V expression, parsing it first when the
	 * list is missing (e.g., after deserialization).
	 */
	private List<VClause> clauses() {
		if (tBuilder == null) {
			parseVExpr();
		}
		return tBuilder;
	}

	/**
	 * Rebuild {@link #tBuilder} from {@link #vExpr}.
	 */
	private void parseVExpr() {
		if (tBuilder == null) {
			tBuilder = new ArrayList<VClause>();
		} else {
			tBuilder.clear();
		}

		if (vExpr == null) {
			return;
		}

		// split string on '+'
		for (String part : vExpr.split("\\+")) {
			final String stripped = StringUtils.strip(part);
			tBuilder.add(new VClause(ValueClause.getClause(stripped), stripped));
		}
	}

	/**
	 * Check to see if the given strings match our LHS+RHS
	 * expressions. All four conditions must hold.
	 *
	 * @param lhs - lhs (ortho)
	 * @param rhs - rhs (ortho)
	 * @param tlhs - lhs (ipa)
	 * @param trhs - rhs (ipa)
	 * @return <code>true</code> if every condition matches its value
	 */
	public boolean matches(String lhs, String rhs, String tlhs, String trhs) {
		return checkExpr(lhsExpr, lhs, lhsType)
				&& checkExpr(rhsExpr, rhs, rhsType)
				&& checkExpr(tlhsExpr, tlhs, tlhsType)
				&& checkExpr(trhsExpr, trhs, trhsType);
	}

	/**
	 * Test a single value against a condition.
	 *
	 * @param expr condition expression
	 * @param value value to test
	 * @param type how expr is compared against value
	 * @return <code>true</code> if the value satisfies the condition;
	 *  <code>false</code> if it does not, if any argument is
	 *  <code>null</code>, or if a PHONEX value cannot be parsed as IPA
	 */
	private boolean checkExpr(String expr, String value, ConditionType type) {
		if (expr == null || value == null || type == null) {
			return false;
		}

		if (type == ConditionType.PHONEX) {
			try {
				final IPATranscript transcript = IPATranscript.parseIPATranscript(value);
				return transcript.matches(expr);
			} catch (ParseException e) {
				// an unparsable transcript is a non-match, not a failure of the rule
				LOGGER.log(Level.WARNING, "Unable to parse IPA transcript '" + value + "'", e);
				return false;
			}
		} else if (type == ConditionType.PLAIN) {
			return value.equals(expr);
		} else if (type == ConditionType.REGEX) {
			return value.matches(expr);
		}
		return false;
	}

	/**
	 * Build the transcript for a contraction using the V expression.
	 *
	 * @param lhs - lhs (ortho)
	 * @param rhs - rhs (ortho)
	 * @param tlhs - lhs (ipa)
	 * @param trhs - rhs (ipa)
	 * @return the concatenated clause values, or an empty string if the
	 *  inputs do not match this rule
	 */
	public String buildTranscript(String lhs, String rhs, String tlhs, String trhs) {
		if (!matches(lhs, rhs, tlhs, trhs)) {
			return "";
		}

		final StringBuilder transcript = new StringBuilder();
		for (VClause vclause : clauses()) {
			if (vclause.type == ValueClause.LHS) {
				transcript.append(lhs);
			} else if (vclause.type == ValueClause.RHS) {
				transcript.append(rhs);
			} else if (vclause.type == ValueClause.T_LHS) {
				transcript.append(tlhs);
			} else if (vclause.type == ValueClause.T_RHS) {
				transcript.append(trhs);
			} else if (vclause.type == ValueClause.LITERAL) {
				transcript.append(vclause.value);
			} else {
				LOGGER.warning("Unknown clause type");
			}
		}
		return transcript.toString();
	}

}