Java tutorial
/////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2012 Assaf Urieli // //This file is part of Talismane. // //Talismane is free software: you can redistribute it and/or modify //it under the terms of the GNU Affero General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. // //Talismane is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Affero General Public License for more details. // //You should have received a copy of the GNU Affero General Public License //along with Talismane. If not, see <http://www.gnu.org/licenses/>. ////////////////////////////////////////////////////////////////////////////// package com.joliciel.frenchTreebank.search; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.StringTokenizer; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.xerces.parsers.DOMParser; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import com.joliciel.frenchTreebank.Category; import com.joliciel.frenchTreebank.Entity; import com.joliciel.frenchTreebank.Function; import com.joliciel.frenchTreebank.Phrase; import com.joliciel.frenchTreebank.PhraseType; import com.joliciel.frenchTreebank.PhraseUnit; import com.joliciel.frenchTreebank.Sentence; import com.joliciel.frenchTreebank.SubCategory; import com.joliciel.frenchTreebank.TreebankException; import com.joliciel.frenchTreebank.TreebankService; import com.joliciel.frenchTreebank.Word; import com.joliciel.frenchTreebank.util.UnicodeReader; import com.joliciel.talismane.utils.LogUtils; public class XmlPatternSearchImpl implements XmlPatternSearch { private static final Log LOG = LogFactory.getLog(XmlPatternSearchImpl.class); String xmlPattern = ""; TreebankService treebankService; private static final String NULL_STRING = "[null]"; int phraseCounter = 0; int phraseUnitCounter = 0; int phraseSubunitCounter = 0; public TreebankService getTreebankService() { return treebankService; } public void setTreebankService(TreebankService treebankService) { this.treebankService = treebankService; } public List<SearchResult> perform() { DOMParser parser = new DOMParser(); StringReader reader = new StringReader(this.xmlPattern); InputSource inputSource = new InputSource(reader); try { parser.parse(inputSource); } catch (SAXException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } Document document = parser.getDocument(); Element firstPhraseTag = document.getDocumentElement(); PhraseNode phraseNode = (PhraseNode) this.traverse(firstPhraseTag, 0); List<String> tablesToReturn = new ArrayList<String>(); List<String> tables = new ArrayList<String>(); List<String> conditions = new ArrayList<String>(); List<String> orderBy = new ArrayList<String>(); this.getSQLElements(phraseNode, tablesToReturn, tables, conditions, orderBy); List<List<Entity>> stuff = this.treebankService.findStuff(tablesToReturn, tables, conditions, orderBy); List<SearchResult> searchResults = new ArrayList<SearchResult>(); for (List<Entity> oneRow : stuff) { SearchResultImpl searchResult = new SearchResultImpl(); for (Entity entity : oneRow) { if (entity instanceof Sentence) searchResult.setSentence((Sentence) entity); else if (entity instanceof Phrase) searchResult.setPhrase((Phrase) entity); else if (entity instanceof PhraseUnit) searchResult.getPhraseUnits().add((PhraseUnit) entity); } for (Entity entity : oneRow) { if (entity instanceof Word) { Word lemma = (Word) entity; for (PhraseUnit punit : searchResult.getPhraseUnits()) { if (punit.getLemmaId() == lemma.getId()) { punit.setLemma(lemma); } } } } searchResults.add(searchResult); } return searchResults; } private void getSQLElements(PhraseMemberNode node, List<String> tablesToReturn, List<String> tables, List<String> conditions, List<String> orderBy) { List<String> localConditions = conditions; List<String> localTables = tables; List<String> localTablesToReturn = tablesToReturn; List<String> localOrderBy = orderBy; // if the node doesn't exist, we'll need to put all of its conditions into a NOT EXISTS clause // hence we store them in a separate List if (!node.exists()) { localConditions = new ArrayList<String>(); localTables = new ArrayList<String>(); localTablesToReturn = new ArrayList<String>(); localOrderBy = new ArrayList<String>(); } if (node instanceof PhraseNode) { PhraseNode phraseNode = (PhraseNode) node; localTables.add("ftb_phrase as " + phraseNode.getAlias()); if (phraseNode.getParent() != null) { localConditions.add(phraseNode.getAlias() + ".phrase_parent_id = " + phraseNode.getParent().getAlias() + ".phrase_id"); } else { // this is the top-level phrase node if (!phraseNode.exists) throw new TreebankException( "The top-level phrase has to exist, otherwise what are searching for?"); // want to return the parent phrase englobing all the other ones localTablesToReturn.add("ftb_phrase as " + phraseNode.getAlias()); // include sentence englobing this phrase String sentenceAlias = "s"; localTables.add("ftb_sentence as " + sentenceAlias); String sentencePhraseAlias = "sp"; localTables.add("ftb_phrase as " + sentencePhraseAlias); String phraseChildAlias = "pc"; localTables.add("ftb_phrase_child as " + phraseChildAlias); localConditions.add(sentenceAlias + ".sentence_id = " + phraseChildAlias + ".pchild_phrase_id"); localConditions .add(phraseNode.getAlias() + ".phrase_id = " + phraseChildAlias + ".pchild_child_id"); localConditions.add(sentencePhraseAlias + ".phrase_id =" + sentenceAlias + ".sentence_id"); localConditions.add(sentencePhraseAlias + ".phrase_id = " + phraseChildAlias + ".pchild_phrase_id"); localTablesToReturn.add("ftb_sentence as " + sentenceAlias); localTablesToReturn.add("ftb_phrase as " + sentencePhraseAlias); localOrderBy.add(sentenceAlias + "_sentence_file_id"); localOrderBy.add(sentenceAlias + "_sentence_id"); } this.addConditions(phraseNode, localConditions); for (PhraseMemberNode child : phraseNode.getChildNodes()) { this.getSQLElements(child, localTablesToReturn, localTables, localConditions, localOrderBy); } // next child node // Phrase node } else { // Word node WordNode wordNode = (WordNode) node; String phraseUnitAlias = wordNode.getAlias(); localTables.add("ftb_phrase_unit as " + phraseUnitAlias); boolean wordNodeExists = wordNode.exists(); PhraseNode parent = wordNode.getParent(); while (parent != null) { if (!parent.exists()) { wordNodeExists = false; break; } parent = parent.getParent(); } if (wordNodeExists) { // want to return the phrase units localTablesToReturn.add("ftb_phrase_unit as " + phraseUnitAlias); // add the lemma String lemmaAlias = "w" + wordNode.getNodeIndex(); localTables.add("ftb_word as " + lemmaAlias); localTablesToReturn.add("ftb_word as " + lemmaAlias); localConditions.add(lemmaAlias + ".word_id = " + phraseUnitAlias + ".punit_lemma_id"); } localConditions .add(phraseUnitAlias + ".punit_phrase_id = " + wordNode.getParent().getAlias() + ".phrase_id"); this.addConditions(wordNode, localConditions); // Let's see if there are any sub-words if (wordNode.getChildNodes().size() > 0) { for (ComponentWordNode subWordNode : wordNode.getChildNodes()) { localTables.add("ftb_phrase_subunit as " + subWordNode.getAlias()); localConditions.add( subWordNode.getAlias() + ".psubunit_punit_id = " + wordNode.getAlias() + ".punit_id"); this.addConditions(subWordNode, localConditions); } // ordering of subwords ComponentWordNode previousNode = null; for (ComponentWordNode subWordNode : wordNode.getChildNodes()) { if (previousNode != null) localConditions.add(previousNode.getAlias() + ".psubunit_position < " + subWordNode.getAlias() + ".psubunit_position"); previousNode = subWordNode; } } // Word node } if (node.getParent() != null) { // Ordering condition PhraseMemberNode previousNode = null; PhraseMemberNode nextNode = null; boolean foundCurrent = false; for (PhraseMemberNode child : node.getParent().getChildNodes()) { if (child.equals(node)) foundCurrent = true; else if (child.exists()) { if (foundCurrent && nextNode == null) { nextNode = child; break; } if (!foundCurrent) previousNode = child; } } String myPositionColumn = ".phrase_position"; if (node instanceof WordNode) myPositionColumn = ".punit_pos_in_phrase"; if (previousNode != null) { if (previousNode instanceof PhraseNode) { localConditions.add( previousNode.getAlias() + ".phrase_position < " + node.getAlias() + myPositionColumn); } else { localConditions.add(previousNode.getAlias() + ".punit_pos_in_phrase < " + node.getAlias() + myPositionColumn); } } if (!node.exists()) { // in this case, we also have to add a condition concerning the next node if (nextNode != null) { if (nextNode instanceof PhraseNode) { localConditions.add(node.getAlias() + myPositionColumn + " < " + nextNode.getAlias() + ".phrase_position"); } else { localConditions.add(node.getAlias() + myPositionColumn + " < " + nextNode.getAlias() + ".punit_pos_in_phrase"); } } } // if the node doesn't exist, add the non existence condition, based on all of the local conditions if (!node.exists()) { String nonExistenceCondition; if (node instanceof PhraseNode) nonExistenceCondition = " NOT EXISTS (SELECT " + node.getAlias() + ".phrase_id"; else nonExistenceCondition = " NOT EXISTS (SELECT " + node.getAlias() + ".punit_id"; boolean firstOne = true; for (String table : localTables) { if (firstOne) { nonExistenceCondition += " FROM " + table; firstOne = false; } else { nonExistenceCondition += ", " + table; } } firstOne = true; for (String condition : localConditions) { if (firstOne) { nonExistenceCondition += " WHERE " + condition; firstOne = false; } else { nonExistenceCondition += " AND " + condition; } } nonExistenceCondition += ")"; conditions.add(nonExistenceCondition); } } } private void addConditions(PhraseNode phraseNode, List<String> conditions) { // add the phrase type condition if (phraseNode.getTypeCodes().size() > 0) { String condition = phraseNode.getAlias() + ".phrase_ptype_id "; List<PhraseType> phraseTypes = new ArrayList<PhraseType>(); for (String phraseTypeCode : phraseNode.getTypeCodes()) { PhraseType phraseType = this.treebankService.loadPhraseType(phraseTypeCode); phraseTypes.add(phraseType); } if (phraseTypes.size() == 1) condition += " = " + phraseTypes.get(0).getId(); else { condition += " IN ("; boolean firstOne = true; for (PhraseType phraseType : phraseTypes) { if (firstOne) { condition += phraseType.getId(); firstOne = false; } else { condition += ", " + phraseType.getId(); } } condition += ")"; } conditions.add(condition); } // add the function condition if (phraseNode.getFunctionCodes().size() > 0) { String condition = phraseNode.getAlias() + ".phrase_function_id "; List<Function> functions = new ArrayList<Function>(); for (String functionCode : phraseNode.getFunctionCodes()) { Function function = this.treebankService.loadFunction(functionCode); functions.add(function); } if (functions.size() == 1) condition += " = " + functions.get(0).getId(); else { condition += " IN ("; boolean firstOne = true; for (Function function : functions) { if (firstOne) { condition += function.getId(); firstOne = false; } else { condition += ", " + function.getId(); } } condition += ")"; } conditions.add(condition); } } private void addConditions(WordNode wordNode, List<String> wordConditions) { // add the category condition if (wordNode.getCategoryCodes().size() > 0) { String condition = wordNode.getAlias() + ".punit_cat_id "; List<Category> categories = new ArrayList<Category>(); for (String categoryCode : wordNode.getCategoryCodes()) { Category category = this.treebankService.loadCategory(categoryCode); categories.add(category); } if (categories.size() == 1) condition += " = " + categories.get(0).getId(); else { condition += " IN ("; boolean firstOne = true; for (Category category : categories) { if (firstOne) { condition += category.getId(); firstOne = false; } else { condition += ", " + category.getId(); } } condition += ")"; } wordConditions.add(condition); } // add the subCategory condition if (wordNode.getSubCategoryCodes().size() > 0) { if (wordNode.getCategoryCodes().size() != 1) throw new TreebankException( "Subcategories can only be used if exactly one category has been specified."); Category category = this.treebankService.loadCategory(wordNode.getCategoryCodes().get(0)); String condition = wordNode.getAlias() + ".punit_subcat_id "; if (wordNode.getSubCategoryCodes().size() == 1 && wordNode.getSubCategoryCodes().get(0).equals(NULL_STRING)) condition += " is null"; else { List<SubCategory> subCategories = new ArrayList<SubCategory>(); for (String subCategoryText : wordNode.getSubCategoryCodes()) { SubCategory subCategory = this.treebankService.loadSubCategory(category, subCategoryText); subCategories.add(subCategory); } if (subCategories.size() == 1) { condition += " = " + subCategories.get(0).getId(); } else { condition += " IN ("; boolean firstOne = true; for (SubCategory subCategory : subCategories) { if (firstOne) { condition += subCategory.getId(); firstOne = false; } else { condition += ", " + subCategory.getId(); } } condition += ")"; } } wordConditions.add(condition); } // add the lemma condition if (wordNode.getLemmas().size() > 0) { String condition = wordNode.getAlias() + ".punit_lemma_id "; List<Word> lemmas = new ArrayList<Word>(); for (String lemmaText : wordNode.getLemmas()) { List<Word> lemmasForText = this.treebankService.findWords(lemmaText); lemmas.addAll(lemmasForText); } if (lemmas.size() == 1) condition += " = " + lemmas.get(0).getId(); else { condition += " IN ("; boolean firstOne = true; for (Word lemma : lemmas) { if (firstOne) { condition += lemma.getId(); firstOne = false; } else { condition += ", " + lemma.getId(); } } condition += ")"; } wordConditions.add(condition); } // add the word condition if (wordNode.getWords().size() > 0) { String condition = wordNode.getAlias() + ".punit_word_id "; List<Word> words = new ArrayList<Word>(); for (String wordText : wordNode.getWords()) { List<Word> wordsForText = this.treebankService.findWords(wordText); words.addAll(wordsForText); } if (words.size() == 1) condition += " = " + words.get(0).getId(); else { condition += " IN ("; boolean firstOne = true; for (Word word : words) { if (firstOne) { condition += word.getId(); firstOne = false; } else { condition += ", " + word.getId(); } } condition += ")"; } wordConditions.add(condition); } } private void addConditions(ComponentWordNode subWordNode, List<String> wordConditions) { // add the category condition if (subWordNode.getCategoryCodes().size() > 0) { String condition = subWordNode.getAlias() + ".psubunit_cat_id "; if (subWordNode.getCategoryCodes().size() == 1 && subWordNode.getCategoryCodes().get(0).equals(NULL_STRING)) condition += " is null"; else { List<Category> categories = new ArrayList<Category>(); for (String categoryCode : subWordNode.getCategoryCodes()) { Category category = this.treebankService.loadCategory(categoryCode); categories.add(category); } if (categories.size() == 1) condition += " = " + categories.get(0).getId(); else { condition += " IN ("; boolean firstOne = true; for (Category category : categories) { if (firstOne) { condition += category.getId(); firstOne = false; } else { condition += ", " + category.getId(); } } condition += ")"; } } wordConditions.add(condition); } // add the word condition if (subWordNode.getWords().size() > 0) { String condition = subWordNode.getAlias() + ".psubunit_word_id "; List<Word> words = new ArrayList<Word>(); for (String wordText : subWordNode.getWords()) { List<Word> wordsForText = this.treebankService.findWords(wordText); words.addAll(wordsForText); } if (words.size() == 1) condition += " = " + words.get(0).getId(); else { condition += " IN ("; boolean firstOne = true; for (Word word : words) { if (firstOne) { condition += word.getId(); firstOne = false; } else { condition += ", " + word.getId(); } } condition += ")"; } wordConditions.add(condition); } } private PhraseMemberNode traverse(Node node, int depth) { { LOG.debug(depth + " " + node.getNodeName()); PhraseMemberNode xmlPatternNode = null; Element element = (Element) node; String tag = node.getNodeName(); if (tag == "phrase") { String phraseTypes = element.getAttribute("type"); String functionCodes = element.getAttribute("fct"); String existsString = element.getAttribute("exists"); boolean exists = existsString == null || !(existsString.equals("no")); LOG.debug( "Phrase type: " + phraseTypes + ", fct = " + functionCodes + ", exists = " + existsString); xmlPatternNode = new PhraseNode(phraseTypes, functionCodes, exists, phraseCounter++); } else if (tag == "w") { String categoryCode = element.getAttribute("cat"); String subCategoryCode = element.getAttribute("subcat"); String lemma = element.getAttribute("lemma"); String word = element.getTextContent(); String existsString = element.getAttribute("exists"); boolean exists = existsString == null || !(existsString.equals("no")); LOG.debug("w: cat=" + categoryCode + ", subcat = " + subCategoryCode + ", lemma=" + lemma + ", word=" + word + ", exists = " + existsString); xmlPatternNode = new WordNode(categoryCode, subCategoryCode, lemma, word, exists, phraseUnitCounter++); } else { String phraseTypes = node.getNodeName(); String functionCodes = element.getAttribute("fct"); String existsString = element.getAttribute("exists"); boolean exists = existsString == null || !(existsString.equals("no")); LOG.debug( "Phrase type: " + phraseTypes + ", fct = " + functionCodes + ", exists = " + existsString); xmlPatternNode = new PhraseNode(phraseTypes, functionCodes, exists, phraseCounter++); } if (xmlPatternNode instanceof PhraseNode) { if (node.hasChildNodes()) { Node child = node.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { PhraseNode phraseNode = (PhraseNode) xmlPatternNode; PhraseMemberNode childNode = this.traverse(child, depth + 1); phraseNode.addNode(childNode); childNode.setParent(phraseNode); } child = child.getNextSibling(); } } } else if (xmlPatternNode instanceof WordNode) { if (node.hasChildNodes()) { Node child = node.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE) { WordNode wordNode = (WordNode) xmlPatternNode; ComponentWordNode childNode = this.getComponentWordNode(child); wordNode.addNode(childNode); childNode.setParent(wordNode); } child = child.getNextSibling(); } } } return xmlPatternNode; } } private ComponentWordNode getComponentWordNode(Node node) { Element element = (Element) node; String tag = node.getNodeName(); if (tag != "w") throw new TreebankException("Only w elements are allowed beneath a w element"); String categoryCode = element.getAttribute("cat"); String word = element.getTextContent(); LOG.debug("component w: cat=" + categoryCode + ", word=" + word); ComponentWordNode componentWordNode = new ComponentWordNode(categoryCode, word, phraseSubunitCounter++); return componentWordNode; } public String getXmlPattern() { return xmlPattern; } public void setXmlPattern(String xmlPattern) { this.xmlPattern = xmlPattern; } private interface XmlPatternNode { public String getAlias(); public int getNodeIndex(); } private interface PhraseMemberNode extends XmlPatternNode { public PhraseNode getParent(); public void setParent(PhraseNode parent); public boolean exists(); } private static final class PhraseNode implements PhraseMemberNode { private List<String> typeCodes = new ArrayList<String>(); private List<String> functionCodes = new ArrayList<String>(); private List<PhraseMemberNode> childNodes = new ArrayList<PhraseMemberNode>(); private PhraseNode parent = null; private String alias = ""; private int nodeIndex; private boolean exists = true; public PhraseNode(String typeString, String functionString, boolean exists, int nodeIndex) { StringTokenizer st = new StringTokenizer(typeString, ",", false); while (st.hasMoreTokens()) typeCodes.add(st.nextToken().trim()); st = new StringTokenizer(functionString, ",", false); while (st.hasMoreTokens()) functionCodes.add(st.nextToken().trim()); this.exists = exists; this.alias = "p" + nodeIndex; this.nodeIndex = nodeIndex; } public List<String> getTypeCodes() { return typeCodes; } public List<String> getFunctionCodes() { return functionCodes; } public boolean exists() { return exists; } public void addNode(PhraseMemberNode node) { childNodes.add(node); } public List<PhraseMemberNode> getChildNodes() { return this.childNodes; } public PhraseNode getParent() { return parent; } public void setParent(PhraseNode parent) { this.parent = parent; } public String getAlias() { return alias; } public int getNodeIndex() { return nodeIndex; } } private static final class WordNode implements PhraseMemberNode { private List<String> subCategoryCodes = new ArrayList<String>(); private List<String> categoryCodes = new ArrayList<String>(); private List<String> lemmas = new ArrayList<String>(); private List<String> words = new ArrayList<String>(); private List<ComponentWordNode> childNodes = new ArrayList<ComponentWordNode>(); private PhraseNode parent = null; private String alias = ""; private int nodeIndex; private boolean exists = true; public WordNode(String categoryString, String subCategoryString, String lemmaString, String wordString, boolean exists, int nodeIndex) { StringTokenizer st = new StringTokenizer(categoryString, ",", false); while (st.hasMoreTokens()) categoryCodes.add(st.nextToken().trim()); st = new StringTokenizer(subCategoryString, ",", false); while (st.hasMoreTokens()) subCategoryCodes.add(st.nextToken().trim()); st = new StringTokenizer(lemmaString, ",", false); while (st.hasMoreTokens()) lemmas.add(st.nextToken().trim()); st = new StringTokenizer(wordString, ",", false); while (st.hasMoreTokens()) words.add(st.nextToken().trim()); this.exists = exists; this.alias = "pu" + nodeIndex; this.nodeIndex = nodeIndex; } public List<String> getCategoryCodes() { return categoryCodes; } public List<String> getSubCategoryCodes() { return subCategoryCodes; } public List<String> getLemmas() { return lemmas; } public List<String> getWords() { return words; } public boolean exists() { return exists; } public PhraseNode getParent() { return parent; } public void setParent(PhraseNode parent) { this.parent = parent; } public String getAlias() { return alias; } public int getNodeIndex() { return nodeIndex; } public void addNode(ComponentWordNode node) { childNodes.add(node); } public List<ComponentWordNode> getChildNodes() { return this.childNodes; } } private static final class ComponentWordNode implements XmlPatternNode { private List<String> categoryCodes = new ArrayList<String>(); private List<String> words = new ArrayList<String>(); private WordNode parent = null; private String alias = ""; private int nodeIndex; public ComponentWordNode(String categoryString, String wordString, int nodeIndex) { StringTokenizer st = new StringTokenizer(categoryString, ",", false); while (st.hasMoreTokens()) categoryCodes.add(st.nextToken().trim()); st = new StringTokenizer(wordString, ",", false); while (st.hasMoreTokens()) { String word = st.nextToken().trim(); if (word.length() > 0) words.add(word); } this.alias = "psu" + nodeIndex; this.nodeIndex = nodeIndex; } public List<String> getCategoryCodes() { return categoryCodes; } public List<String> getWords() { return words; } @SuppressWarnings("unused") public WordNode getParent() { return parent; } public void setParent(WordNode parent) { this.parent = parent; } public String getAlias() { return alias; } public int getNodeIndex() { return nodeIndex; } } public void setXmlPatternFile(String xmlPatternFile) { try { StringBuffer fileData = new StringBuffer(1000); Reader reader = new UnicodeReader(new FileInputStream(xmlPatternFile), "UTF-8"); char[] buf = new char[1024]; int numRead = 0; while ((numRead = reader.read(buf)) != -1) { String readData = String.valueOf(buf, 0, numRead); fileData.append(readData); buf = new char[1024]; } reader.close(); xmlPattern = fileData.toString(); LOG.debug(xmlPattern); } catch (FileNotFoundException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } } }