List of usage examples for org.jdom2 Element getChildren
public List<Element> getChildren(final String cname, final Namespace ns)
Returns a List of all the child elements nested directly (one level deep) within this element with the given local name and belonging to the given Namespace, returned as Element objects.
From source file:de.hbrs.oryx.yawl.converter.layout.YAWLLayoutConverter.java
License:Open Source License
/**
 * Converts all decorators of a YAWL task. There may be two decorators, each with alignment
 * TOP, LEFT, RIGHT, BOTTOM.
 *
 * <p>A decorator whose "type" attribute contains "join" is stored as the join decorator, one
 * whose "type" contains "split" as the split decorator. Decorators without a "type" attribute
 * are skipped.
 *
 * @param yawlContainer
 *            the container element of the YAWL task
 * @param layoutInformation
 *            already converted layout of the YAWL task
 */
private void convertDecorator(final Element yawlContainer, final NetElementLayout layoutInformation) {
    // JDOM2 returns a typed, never-null (possibly empty) list, so no raw type, cast or null
    // check is required.
    final List<Element> yawlDecoratorList = yawlContainer.getChildren("decorator", yawlNamespace);

    for (final Element yawlDecorator : yawlDecoratorList) {
        // Read the attribute once; it is null when the decorator carries no "type".
        final String type = yawlDecorator.getAttributeValue("type");
        if (type == null) {
            // Cannot be classified as join or split - skip instead of failing with an NPE.
            continue;
        }

        final NetElementLayout.DecoratorType decoratorType = convertDecoratorType(yawlDecorator);
        if (type.contains("join")) {
            layoutInformation.setJoinDecorator(decoratorType);
        }
        if (type.contains("split")) {
            layoutInformation.setSplitDecorator(decoratorType);
        }
    }
}
From source file:de.huberlin.german.korpling.laudatioteitool.SplitTEI.java
License:Apache License
/**
 * Writes every "teiHeader" child of the root's "teiCorpus" element into its own TEI file
 * inside the "DocumentHeader" sub-directory of the output directory and validates each
 * written file.
 *
 * <p>The file name is the header's {@code xml:id} on "fileDesc" if present, otherwise the text
 * of "titleStmt/title" if present, otherwise a random UUID.
 *
 * @param doc the merged TEI document to split
 * @return the errors collected by the validator over all written files
 * @throws LaudatioException if the "DocumentHeader" directory cannot be created
 * @throws IOException if a header file cannot be written
 * @throws SAXException declared by the original signature; presumably raised by validation
 * @throws NullPointerException if an expected element ("teiCorpus", "fileDesc", "titleStmt")
 *         is missing
 * @throws IllegalStateException if a header is not of type "DocumentHeader"
 */
private TEIValidator.Errors extractDocumentHeaders(Document doc)
        throws LaudatioException, IOException, SAXException {
    TEIValidator validator = documentSchemeURL == null ? new TEIDocumentValidator()
            : new FromURLValidator(documentSchemeURL);

    File documentDir = new File(outputDirectory, "DocumentHeader");
    if (!documentDir.exists() && !documentDir.mkdir()) {
        throw new LaudatioException(
                messages.getString("COULD NOT CREATE DIRECTORY") + documentDir.getAbsolutePath());
    }

    Element documentRoot = Preconditions.checkNotNull(doc.getRootElement().getChild("teiCorpus", null));
    Namespace teiNS = Namespace.getNamespace("http://www.tei-c.org/ns/1.0");

    for (Element docHeader : documentRoot.getChildren("teiHeader", null)) {
        Preconditions.checkState("DocumentHeader".equals(docHeader.getAttributeValue("type")));

        // create the stand-alone TEI document wrapping a copy of this document header
        Element tei = new Element("TEI", teiNS);
        tei.addContent(docHeader.clone());
        Document newDoc = new Document(tei);

        if (documentSchemeURL == null) {
            newDoc.addContent(0, new ProcessingInstruction("xml-model",
                    "href=\"" + TEIDocumentValidator.DEFAULT_SCHEME_URL + "\""));
        } else {
            newDoc.addContent(0,
                    new ProcessingInstruction("xml-model", "href=\"" + documentSchemeURL + "\""));
        }

        // we need to append an empty "text" element after the header
        Element text = new Element("text", teiNS);
        text.setText("");
        tei.addContent(text);

        Element fileDesc = Preconditions
                .checkNotNull(tei.getChild("teiHeader", null).getChild("fileDesc", null));

        // NOTE(review): outName is taken verbatim from the document; a value containing path
        // separators would resolve outside documentDir - consider sanitizing.
        String outName = UUID.randomUUID().toString();
        String id = fileDesc.getAttributeValue("id", Namespace.XML_NAMESPACE);
        if (id != null) {
            outName = id;
        } else {
            Element titleStmt = Preconditions.checkNotNull(fileDesc.getChild("titleStmt", null));
            String title = titleStmt.getChildText("title", null);
            if (title != null) {
                outName = title;
            }
        }

        File outputFile = new File(documentDir, outName + ".xml");
        XMLOutputter xmlOut = new XMLOutputter(Format.getPrettyFormat());
        // Close the stream before validating: otherwise every header leaks a file handle and
        // the validator reads a file that is still open for writing.
        try (FileOutputStream fileOut = new FileOutputStream(outputFile);
                OutputStreamWriter writer = new OutputStreamWriter(fileOut, "UTF-8")) {
            xmlOut.output(newDoc, writer);
        }
        log.info(messages.getString("WRITTEN DOCUMENT HEADER"), outputFile.getPath());

        validator.validate(outputFile);
    }
    return validator.getErrors();
}
From source file:de.huberlin.german.korpling.laudatioteitool.SplitTEI.java
License:Apache License
/**
 * Writes every "teiHeader" child of the nested "teiCorpus/teiCorpus" element into its own TEI
 * file inside the "PreparationHeader" sub-directory of the output directory and validates each
 * written file.
 *
 * <p>The file name is the "corresp" attribute of "titleStmt/title" if present, otherwise a
 * random UUID. When the same "corresp" value occurs more than once, later files get the suffix
 * "_&lt;occurrence count&gt;" and a warning is logged.
 *
 * @param doc the merged TEI document to split
 * @return the errors collected by the validator over all written files
 * @throws LaudatioException if the "PreparationHeader" directory cannot be created
 * @throws IOException if a header file cannot be written
 * @throws SAXException declared by the original signature; presumably raised by validation
 * @throws NullPointerException if an expected element ("teiCorpus", "fileDesc", "titleStmt",
 *         "title") is missing
 * @throws IllegalStateException if a header is not of type "PreparationHeader"
 */
private TEIValidator.Errors extractPreparationSteps(Document doc)
        throws LaudatioException, IOException, SAXException {
    TEIValidator validator = preparationSchemeURL == null ? new TEIPreparationValidator()
            : new FromURLValidator(preparationSchemeURL);
    Multiset<String> knownPreparationTitles = HashMultiset.create();

    File documentDir = new File(outputDirectory, "PreparationHeader");
    if (!documentDir.exists() && !documentDir.mkdir()) {
        throw new LaudatioException(
                messages.getString("COULD NOT CREATE DIRECTORY") + documentDir.getAbsolutePath());
    }

    // Both corpus levels are mandatory; look each one up exactly once.
    Element corpusRoot = Preconditions.checkNotNull(doc.getRootElement().getChild("teiCorpus", null));
    Element preparationRoot = Preconditions.checkNotNull(corpusRoot.getChild("teiCorpus", null));
    Namespace teiNS = Namespace.getNamespace("http://www.tei-c.org/ns/1.0");

    for (Element preparationHeader : preparationRoot.getChildren("teiHeader", null)) {
        Preconditions.checkState("PreparationHeader".equals(preparationHeader.getAttributeValue("type")));

        // create the stand-alone TEI document wrapping a copy of this preparation header
        Element tei = new Element("TEI", teiNS);
        tei.addContent(preparationHeader.clone());
        Document newDoc = new Document(tei);

        if (preparationSchemeURL == null) {
            newDoc.addContent(0, new ProcessingInstruction("xml-model",
                    "href=\"" + TEIPreparationValidator.DEFAULT_SCHEME_URL + "\""));
        } else {
            newDoc.addContent(0,
                    new ProcessingInstruction("xml-model", "href=\"" + preparationSchemeURL + "\""));
        }

        // we need to append an empty "text" element after the header
        Element text = new Element("text", teiNS);
        text.setText("");
        tei.addContent(text);

        Element fileDesc = Preconditions
                .checkNotNull(tei.getChild("teiHeader", null).getChild("fileDesc", null));

        // NOTE(review): outName is taken verbatim from the document; a value containing path
        // separators would resolve outside documentDir - consider sanitizing.
        String outName = UUID.randomUUID().toString();

        Element titleStmt = Preconditions.checkNotNull(fileDesc.getChild("titleStmt", null));
        Element title = Preconditions.checkNotNull(titleStmt.getChild("title", null));
        String corresp = title.getAttributeValue("corresp");
        if (corresp != null) {
            if (knownPreparationTitles.contains(corresp)) {
                // repeated title: disambiguate with the running occurrence count
                knownPreparationTitles.add(corresp);
                outName = corresp + "_" + knownPreparationTitles.count(corresp);
                log.warn(messages.getString("MORE THAN ONE PREPARATION HEADER"), corresp);
            } else {
                outName = corresp;
                knownPreparationTitles.add(corresp);
            }
        }

        File outputFile = new File(documentDir, outName + ".xml");
        XMLOutputter xmlOut = new XMLOutputter(Format.getPrettyFormat());
        // Close the stream before validating: otherwise every header leaks a file handle and
        // the validator reads a file that is still open for writing.
        try (FileOutputStream fileOut = new FileOutputStream(outputFile);
                OutputStreamWriter writer = new OutputStreamWriter(fileOut, "UTF-8")) {
            xmlOut.output(newDoc, writer);
        }
        log.info(messages.getString("WRITTEN PREPARATION HEADER"), outputFile.getPath());

        validator.validate(outputFile);
    }
    return validator.getErrors();
}
From source file:de.intranda.goobi.plugins.sru.SRUHelper.java
License:Open Source License
public static Element getRecordWithoutSruHeader(Document document) { Element root = document.getRootElement(); // <srw:records> Element srw_records = root.getChild("records", SRW); // <srw:record> if (srw_records == null) { return null; }/*from w w w . jav a2 s . co m*/ List<Element> srw_recordList = srw_records.getChildren("record", SRW); // <srw:recordData> if (srw_recordList == null || srw_recordList.isEmpty()) { return null; } Element recordData = srw_recordList.get(0).getChild("recordData", SRW); Element record = recordData.getChild("record", MARC); return record; }
From source file:de.nava.informa.parsers.Atom_0_3_Parser.java
License:Open Source License
/** * Looks for "content" elements and takes first from them or looks for "summary" element if * "content" not found./*from w w w . j a va 2s. com*/ * * @param item item element. * @param namespace namespace. * @return description for item. */ public static String getDescription(Element item, Namespace namespace) { String strDesc = ""; Element elDesc; List contents = item.getChildren("content", namespace); if (contents.size() > 0) { elDesc = (Element) contents.get(0); } else { elDesc = item.getChild("summary", namespace); } if (elDesc != null) { strDesc = getValue(elDesc); } return strDesc; }
From source file:de.nava.informa.parsers.Atom_0_3_Parser.java
License:Open Source License
/** * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom2.Element) *//*from w ww . ja v a 2 s . com*/ public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel) throws ParseException { if (cBuilder == null) { throw new RuntimeException("Without builder no channel can " + "be created."); } Date dateParsed = new Date(); Namespace defNS = ParserUtils.getDefaultNS(channel); if (defNS == null) { defNS = Namespace.NO_NAMESPACE; LOGGER.info("No default namespace found."); } // RSS 1.0 Dublin Core Module namespace Namespace dcNS = ParserUtils.getNamespace(channel, "dc"); if (dcNS == null) { LOGGER.debug("No namespace for dublin core found"); dcNS = defNS; } LOGGER.debug("start parsing."); // get version attribute String formatVersion = "0.3"; if (channel.getAttribute("version") != null) { formatVersion = channel.getAttribute("version").getValue().trim(); LOGGER.debug("Atom version " + formatVersion + " specified in document."); } else { LOGGER.info("No format version specified, using default."); } // --- read in channel information // Lower the case of these tags to simulate case-insensitive parsing ParserUtils.matchCaseOfChildren(channel, new String[] { "title", "description", "tagline", "ttl", "modified", "author", "generator", "copyright", "link", "entry" }); // title element ChannelIF chnl = cBuilder.createChannel(channel, channel.getChildTextTrim("title", defNS)); // TODO: support attributes: type, mode chnl.setFormat(ChannelFormat.ATOM_0_3); // language String language = channel.getAttributeValue("lang", Namespace.XML_NAMESPACE); if (language != null) { chnl.setLanguage(language); } // description element if (channel.getChild("description") != null) { chnl.setDescription(channel.getChildTextTrim("description", defNS)); } else { // fallback chnl.setDescription(channel.getChildTextTrim("tagline", defNS)); } // ttl in dc namespace Element ttl = channel.getChild("ttl", dcNS); if (ttl != null) { String ttlString = 
ttl.getTextTrim(); if (ttlString != null) { chnl.setTtl(Integer.parseInt(ttlString)); } } // lastbuild element : modified ? Element modified = channel.getChild("modified", defNS); if (modified != null) { chnl.setPubDate(ParserUtils.getDate(modified.getTextTrim())); } // TODO : issued value /* if (modified != null) { modified = channel.getChild("issued", defNS); chnl.setLastBuildDate (ParserUtils.getDate(modified.getTextTrim())); } */ // author element Element author = channel.getChild("author", defNS); if (author != null) { ParserUtils.matchCaseOfChildren(author, "name"); chnl.setCreator(author.getChildTextTrim("name", defNS)); } // generator element Element generator = channel.getChild("generator", defNS); if (generator != null) { chnl.setGenerator(generator.getTextTrim()); } // copyright element Element copyright = channel.getChild("copyright", defNS); if (copyright != null) { chnl.setCopyright(getCopyright(copyright)); } // n link elements // TODO : type attribut of link (text, application...) 
List links = channel.getChildren("link", defNS); Iterator i = links.iterator(); while (i.hasNext()) { Element linkElement = (Element) i.next(); // use first 'alternate' link String rel = linkElement.getAttributeValue("rel"); String href = linkElement.getAttributeValue("href"); if ((rel != null) && (href != null) && rel.equals("alternate")) { URL linkURL = ParserUtils.getURL(href); chnl.setSite(linkURL); break; } // TODO: further extraction of link information } // 1..n entry elements List items = channel.getChildren("entry", defNS); i = items.iterator(); while (i.hasNext()) { Element item = (Element) i.next(); // Lower the case of these tags to simulate case-insensitive parsing ParserUtils.matchCaseOfChildren(item, new String[] { "title", "link", "content", "summary", "issued", "subject" }); // get title element // TODO : deal with type attribut Element elTitle = item.getChild("title", defNS); String strTitle = "<No Title>"; if (elTitle != null) { strTitle = getTitle(elTitle); LOGGER.debug("Parsing title " + elTitle.getTextTrim() + "->" + strTitle); } if (LOGGER.isDebugEnabled()) { LOGGER.debug("Entry element found (" + strTitle + ")."); } // get link element String strLink = AtomParserUtils.getItemLink(item, defNS); // get description element String strDesc = getDescription(item, defNS); // generate new news item (link to article) ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc, ParserUtils.getURL(strLink)); curItem.setFound(dateParsed); // get issued element (required) Element elIssued = item.getChild("issued", defNS); if (elIssued == null) { // [adewale@gmail.com, 01-May-2005] Fix for blogs which have // 'created' dates, but not 'issued' dates -- in clear contravention // of the Atom 0.3 spec. 
Element elCreated = item.getChild("created", defNS); if (elCreated != null) { curItem.setDate(ParserUtils.getDate(elCreated.getTextTrim())); } } else { curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim())); } // get subject element Element elSubject = item.getChild("subject", dcNS); if (elSubject != null) { // TODO: Mulitple subject elements not handled currently curItem.setSubject(elSubject.getTextTrim()); } } // set to current date chnl.setLastUpdated(dateParsed); return chnl; }
From source file:de.nava.informa.parsers.Atom_1_0_Parser.java
License:Open Source License
/** * Looks for "content" elements and takes first from them or looks for * "summary" element if "content" not found. * * @param item item element.//from ww w.jav a 2s . co m * @param namespace namespace. * @return description for item. */ public static String getDescription(Element item, Namespace namespace) { String strDesc = ""; Element elDesc; // TODO there should be some way of knowing if we are returning summary or // content List contents = item.getChildren("content", namespace); if (contents.size() > 0) { elDesc = (Element) contents.get(0); } else { elDesc = item.getChild("summary", namespace); } if (elDesc != null) { strDesc = AtomParserUtils.getValue(elDesc, getMode(elDesc)); } return strDesc; }
From source file:de.nava.informa.parsers.Atom_1_0_Parser.java
License:Open Source License
/** * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom2.Element) *//*from w w w .j a v a 2s . com*/ public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel) throws ParseException { if (cBuilder == null) { throw new RuntimeException("Without builder no channel can " + "be created."); } Date dateParsed = new Date(); Namespace defNS = ParserUtils.getDefaultNS(channel); if (defNS == null) { defNS = Namespace.NO_NAMESPACE; LOGGER.info("No default namespace found."); } else if ((defNS.getURI() == null) || !defNS.getURI().equals("http://www.w3.org/2005/Atom")) { LOGGER.warn("Namespace is not really supported, still trying assuming Atom 1.0 format"); } LOGGER.debug("start parsing."); // --- read in channel information // Lower the case of these tags to simulate case-insensitive parsing ParserUtils.matchCaseOfChildren(channel, new String[] { "title", "subtitle", "updated", "published", "author", "generator", "rights", "link", "entry" }); // TODO icon and logo: Feed element can have upto 1 logo and icon. // TODO id: Feed and all entries have a unique id string. This can // be the URL of the website. Supporting this will require API change. // TODO: Feed can optionally have category information // title element ChannelIF chnl = cBuilder.createChannel(channel, channel.getChildTextTrim("title", defNS)); chnl.setFormat(ChannelFormat.ATOM_1_0); // description element if (channel.getChild("subtitle") != null) { chnl.setDescription(channel.getChildTextTrim("subtitle", defNS)); } // TODO: should we use summary element? // lastbuild element : updated ? 
Element updated = channel.getChild("updated", defNS); if (updated != null) { chnl.setPubDate(ParserUtils.getDate(updated.getTextTrim())); } // author element List authors = channel.getChildren("author", defNS); chnl.setCreator(getAuthorString(authors, defNS)); // TODO we are ignoring contributors information // generator element Element generator = channel.getChild("generator", defNS); if (generator != null) { chnl.setGenerator(generator.getTextTrim()); } // TODO generator can have URI and version information // copyright element Element rights = channel.getChild("rights", defNS); if (rights != null) { chnl.setCopyright(AtomParserUtils.getValue(rights, getMode(rights))); } List links = channel.getChildren("link", defNS); Iterator i = links.iterator(); URL linkUrl = null; while (i.hasNext()) { Element linkElement = (Element) i.next(); // use first 'alternate' link // if rel is not present, use first link without rel String rel = linkElement.getAttributeValue("rel"); String href = linkElement.getAttributeValue("href"); // TODO we need to handle relative links also if ((rel == null) && (href != null) && (linkUrl == null)) { linkUrl = ParserUtils.getURL(href); } else if ((rel != null) && (href != null) && rel.equals("alternate")) { linkUrl = ParserUtils.getURL(href); break; } } if (linkUrl != null) { chnl.setSite(linkUrl); } List items = channel.getChildren("entry", defNS); i = items.iterator(); while (i.hasNext()) { Element item = (Element) i.next(); // Lower the case of these tags to simulate case-insensitive parsing ParserUtils.matchCaseOfChildren(item, new String[] { "title", "link", "content", "summary", "published", "author" }); // TODO entry, if copied from some other feed, may have source element // TODO each entry can have its own rights declaration // get title element Element elTitle = item.getChild("title", defNS); String strTitle = "<No Title>"; if (elTitle != null) { strTitle = AtomParserUtils.getValue(elTitle, getMode(elTitle)); LOGGER.debug("Parsing 
title " + elTitle.getTextTrim() + "->" + strTitle); } if (LOGGER.isDebugEnabled()) { LOGGER.debug("Entry element found (" + strTitle + ")."); } // get link element String strLink = AtomParserUtils.getItemLink(item, defNS); // get description element String strDesc = getDescription(item, defNS); // generate new news item (link to article) ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc, ParserUtils.getURL(strLink)); //TODO enclosure data curItem.setFound(dateParsed); List itemAuthors = item.getChildren("author", defNS); curItem.setCreator(getAuthorString(itemAuthors, defNS)); // get published element Element elIssued = item.getChild("published", defNS); if (elIssued == null) { // published element may not be present (but updated should be) Element elUpdated = item.getChild("updated", defNS); // TODO there should be some way to determining which one are we // returning if (elUpdated != null) { curItem.setDate(ParserUtils.getDate(elUpdated.getTextTrim())); } } else { curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim())); } // get list of category elements List elCategoryList = item.getChildren("category", defNS); // categories present will be stored here Collection<CategoryIF> categories = new ArrayList<>(); // multiple category elements may be present for (Object elCategoryItem : elCategoryList) { Element elCategory = (Element) elCategoryItem; // notice: atom spec. forbids to have category "term" (="subject") // set as inner text of category tags, so we have to read it from // the "term" attribute if (elCategory != null) { // TODO: what if we have more than one category element present? // subject would be overwritten each loop and therefore represent only // the last category read, so does this make any sense? // TODO: what about adding functionality for accessing "label" or "scheme" attributes? 
// if set, a label should be displayed instead of the value set in term // we keep this line not to break up things which // use getSubject() to read an item category curItem.setSubject(elCategory.getAttributeValue("term")); CategoryIF c = new Category(elCategory.getAttributeValue("term")); // add current category to category list categories.add(c); } } // assign categories curItem.setCategories(categories); } // set to current date chnl.setLastUpdated(dateParsed); return chnl; }
From source file:de.nava.informa.parsers.RSS_1_0_Parser.java
License:Open Source License
public ChannelIF parse(ChannelBuilderIF cBuilder, Element root) throws ParseException { if (cBuilder == null) { throw new RuntimeException("Without builder no channel can " + "be created."); }//from w w w . ja va 2s . com Date dateParsed = new Date(); Namespace defNS = ParserUtils.getDefaultNS(root); if (defNS == null) { defNS = Namespace.NO_NAMESPACE; logger.info("No default namespace found."); } // RSS 1.0 Dublin Core Module namespace Namespace dcNS = ParserUtils.getNamespace(root, "dc"); // fall back to default name space (for retrieving descriptions) if (dcNS == null) { dcNS = defNS; } // RSS 1.0 Syndication Module namespace Namespace syNS = ParserUtils.getNamespace(root, "sy"); // RSS 1.0 Aggregation Module namespace Namespace agNS = ParserUtils.getNamespace(root, "ag"); // RSS 1.0 Administration Module namespace Namespace adminNS = ParserUtils.getNamespace(root, "admin"); // RSS 1.0 DCTerms Module namespace Namespace dctermsNS = ParserUtils.getNamespace(root, "dcterms"); // RSS 1.0 Annotation Module namespace Namespace annotateNS = ParserUtils.getNamespace(root, "annotate"); // RSS091 Module namespace Namespace rss091NS = ParserUtils.getNamespace(root, "rss091"); // Content namespace Namespace contentNS = ParserUtils.getNamespace(root, "content"); ParserUtils.matchCaseOfChildren(root, new String[] { "channel", "item", "image", "textinput" }); // Get the channel element (only one occurs) Element channel = root.getChild("channel", defNS); if (channel == null) { logger.warn("Channel element could not be retrieved from feed."); throw new ParseException("No channel element found in feed."); } // ----------------------- read in channel information ParserUtils.matchCaseOfChildren(channel, new String[] { "title", "description", "link", "creator", "managingEditor", "publisher", "errorReportsTo", "webMaster", "language", "rights", "copyright", "rating", "date", "issued", "pubdate", "lastBuildDate", "modified", "generatorAgent", "updatePeriod", "updateFrequency", 
"updateBase" }); // title element ChannelIF chnl = cBuilder.createChannel(channel, channel.getChildTextTrim("title", defNS)); // set channel format chnl.setFormat(ChannelFormat.RSS_1_0); // description element chnl.setDescription(channel.getChildTextTrim("description", defNS)); // link element chnl.setSite(ParserUtils.getURL(channel.getChildTextTrim("link", defNS))); // creator element Element creator = channel.getChild("creator", dcNS); if (creator == null) { creator = channel.getChild("managingEditor", rss091NS); } if (creator != null) { chnl.setCreator(creator.getTextTrim()); } // publisher element String publisher = channel.getChildTextTrim("publisher", dcNS); if (publisher == null) { Element elErrorReportsTo = channel.getChild("errorReportsTo", adminNS); if (elErrorReportsTo != null) { publisher = elErrorReportsTo.getAttributeValue("resource", ParserUtils.getNamespace(elErrorReportsTo, "rdf")); } } if (publisher == null) { publisher = channel.getChildTextTrim("webMaster", rss091NS); } chnl.setPublisher(publisher); // language element Element language = channel.getChild("language", dcNS); if (language == null) { language = channel.getChild("language", rss091NS); } if (language != null) { chnl.setLanguage(language.getTextTrim()); } // rights element Element copyright = channel.getChild("rights", dcNS); if (copyright == null) { copyright = channel.getChild("copyright", rss091NS); } if (copyright != null) { chnl.setCopyright(copyright.getTextTrim()); } // 0..1 Rating element Element rating = channel.getChild("rating", rss091NS); if (rating != null) { chnl.setRating(rating.getTextTrim()); } // 0..1 Docs element // use namespace URI chnl.setDocs(defNS.getURI()); // 0..1 pubDate element Element pubDate = channel.getChild("date", dcNS); if (pubDate == null) { pubDate = channel.getChild("issued", dctermsNS); } if (pubDate == null) { pubDate = channel.getChild("pubdate", rss091NS); } if (pubDate != null) { chnl.setPubDate(ParserUtils.getDate(pubDate.getTextTrim())); } 
// 0..1 lastBuildDate element Element lastBuildDate = channel.getChild("lastBuildDate"); if (lastBuildDate == null) { lastBuildDate = channel.getChild("modified", dctermsNS); } if (lastBuildDate == null) { lastBuildDate = channel.getChild("lastBuildDate", rss091NS); } if (lastBuildDate != null) { chnl.setLastBuildDate(ParserUtils.getDate(lastBuildDate.getTextTrim())); } // RSS 1.0 Administration Module support // 0..1 generator element Element elGenerator = channel.getChild("generatorAgent", adminNS); if (elGenerator != null) { Attribute generator = elGenerator.getAttribute("resource", ParserUtils.getNamespace(elGenerator, "rdf")); if (generator != null) { chnl.setGenerator(generator.getValue()); } } // RSS 1.0 Syndication Module support // 0..1 update period element Element updatePeriod = channel.getChild("updatePeriod", syNS); if (updatePeriod != null) { try { ChannelUpdatePeriod channelUpdatePeriod = ChannelUpdatePeriod .valueFromText(updatePeriod.getTextTrim()); chnl.setUpdatePeriod(channelUpdatePeriod); } catch (IllegalArgumentException ex) { logger.warn(updatePeriod.getTextTrim(), ex); } } // 0..1 update frequency element Element updateFrequency = channel.getChild("updateFrequency", syNS); if (updateFrequency != null) { chnl.setUpdateFrequency((new Integer(updateFrequency.getTextTrim())).intValue()); } // 0..1 update base element Element updateBase = channel.getChild("updateBase", syNS); if (updateBase != null) { chnl.setUpdateBase(ParserUtils.getDate(updateBase.getTextTrim())); } if ((updatePeriod != null) && updateFrequency != null) { int ttl = getTTL(chnl.getUpdatePeriod(), chnl.getUpdateFrequency()); chnl.setTtl(ttl); } // item elements List items = root.getChildren("item", defNS); Iterator i = items.iterator(); while (i.hasNext()) { Element item = (Element) i.next(); ParserUtils.matchCaseOfChildren(item, new String[] { "title", "link", "encoded", "description", "creator", "subject", "date", "sourceURL", "source", "timestamp", "reference" }); // get title 
element Element elTitle = item.getChild("title", defNS); String strTitle = "<No Title>"; if (elTitle != null) { strTitle = elTitle.getTextTrim(); } if (logger.isDebugEnabled()) { logger.debug("Item element found (" + strTitle + ")."); } // get link element Element elLink = item.getChild("link", defNS); String strLink = ""; if (elLink != null) { strLink = elLink.getTextTrim(); } // get description element Element elDesc = item.getChild("encoded", contentNS); if (elDesc == null) { elDesc = item.getChild("description", defNS); } if (elDesc == null) { elDesc = item.getChild("description", dcNS); } String strDesc = ""; if (elDesc != null) { strDesc = elDesc.getTextTrim(); } // generate new RSS item (link to article) ItemIF rssItem = cBuilder.createItem(item, chnl, strTitle, strDesc, ParserUtils.getURL(strLink)); rssItem.setFound(dateParsed); // get creator element Element elCreator = item.getChild("creator", dcNS); if (elCreator != null) { rssItem.setCreator(elCreator.getTextTrim()); } // get subject element Element elSubject = item.getChild("subject", dcNS); if (elSubject != null) { // TODO: Mulitple subject elements not handled currently rssItem.setSubject(elSubject.getTextTrim()); } // get date element Element elDate = item.getChild("date", dcNS); if (elDate != null) { rssItem.setDate(ParserUtils.getDate(elDate.getTextTrim())); } // get source element - default to Aggregation module, then try Dublin Core String sourceName = null; String sourceLocation = null; Date sourceTimestamp = null; Element elSourceURL = item.getChild("sourceURL", agNS); if (elSourceURL == null) { // No Aggregation module - try Dublin Core elSourceURL = item.getChild("source", dcNS); if (elSourceURL != null) { sourceLocation = elSourceURL.getTextTrim(); sourceName = "Source"; } } else { // Aggregation module sourceLocation = elSourceURL.getTextTrim(); Element elSourceName = item.getChild("source", agNS); if (elSourceName != null) { sourceName = elSourceName.getTextTrim(); } Element 
elSourceTimestamp = item.getChild("timestamp", agNS); if (elSourceTimestamp != null) { sourceTimestamp = ParserUtils.getDate(elSourceTimestamp.getTextTrim()); } } if (sourceLocation != null) { ItemSourceIF itemSource = cBuilder.createItemSource(rssItem, sourceName, sourceLocation, sourceTimestamp); rssItem.setSource(itemSource); } // comments element - use Annotation module Element elReference = item.getChild("reference", annotateNS); if (elReference != null) { Attribute resource = elReference.getAttribute("resource", ParserUtils.getNamespace(elReference, "rdf")); if (resource != null) { URL resourceURL = ParserUtils.getURL(resource.getValue()); if (resourceURL != null) { rssItem.setComments(resourceURL); } } } } // image element Element image = root.getChild("image", defNS); if (image != null) { ParserUtils.matchCaseOfChildren(image, new String[] { "title", "url", "link", "width", "height", "description" }); ImageIF rssImage = cBuilder.createImage(image.getChildTextTrim("title", defNS), ParserUtils.getURL(image.getChildTextTrim("url", defNS)), ParserUtils.getURL(image.getChildTextTrim("link", defNS))); Element imgWidth = image.getChild("width", defNS); if (imgWidth != null) { try { rssImage.setWidth(Integer.parseInt(imgWidth.getTextTrim())); } catch (NumberFormatException e) { logger.warn(e); } } Element imgHeight = image.getChild("height", defNS); if (imgHeight != null) { try { rssImage.setHeight(Integer.parseInt(imgHeight.getTextTrim())); } catch (NumberFormatException e) { logger.warn(e); } } Element imgDescr = image.getChild("description", defNS); if (imgDescr != null) { rssImage.setDescription(imgDescr.getTextTrim()); } chnl.setImage(rssImage); } // textinput element Element txtinp = root.getChild("textinput", defNS); if (txtinp != null) { ParserUtils.matchCaseOfChildren(image, new String[] { "title", "description", "name", "link" }); String tiTitle = null; if (txtinp.getChild("title", defNS) != null) { tiTitle = txtinp.getChild("title", defNS).getTextTrim(); 
} String tiDescr = null; if (txtinp.getChild("description", defNS) != null) { tiDescr = txtinp.getChild("description", defNS).getTextTrim(); } String tiName = null; if (txtinp.getChild("name", defNS) != null) { tiName = txtinp.getChild("name", defNS).getTextTrim(); } URL tiLink = null; if (txtinp.getChild("link", defNS) != null) { tiLink = ParserUtils.getURL(txtinp.getChild("link", defNS).getTextTrim()); } TextInputIF rssTextInput = cBuilder.createTextInput(tiTitle, tiDescr, tiName, tiLink); chnl.setTextInput(rssTextInput); } chnl.setLastUpdated(dateParsed); return chnl; }
From source file:de.nava.informa.parsers.RSS_2_0_Parser.java
License:Open Source License
/** * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom2.Element) *//* w ww.ja v a2s.c om*/ public ChannelIF parse(ChannelBuilderIF cBuilder, Element root) throws ParseException { if (cBuilder == null) { throw new RuntimeException("Without builder no channel can be created."); } Date dateParsed = new Date(); logger.debug("start parsing."); Namespace defNS = ParserUtils.getDefaultNS(root); if (defNS == null) { defNS = Namespace.NO_NAMESPACE; logger.info("No default namespace found."); } Namespace dcNS = ParserUtils.getNamespace(root, "dc"); // fall back to default name space if (dcNS == null) { dcNS = defNS; } // Content namespace Namespace contentNS = ParserUtils.getNamespace(root, "content"); // fall back to default name space if (contentNS == null) { contentNS = defNS; } ParserUtils.matchCaseOfChildren(root, "channel"); // Get the channel element (only one occurs) Element channel = root.getChild("channel", defNS); if (channel == null) { logger.warn("Channel element could not be retrieved from feed."); throw new ParseException("No channel element found in feed."); } // --- read in channel information ParserUtils.matchCaseOfChildren(channel, new String[] { "title", "description", "link", "language", "item", "image", "textinput", "copyright", "rating", "docs", "generator", "pubDate", "lastBuildDate", "category", "managingEditor", "webMaster", "cloud" }); // 1 title element ChannelIF chnl = cBuilder.createChannel(channel, channel.getChildTextTrim("title", defNS)); // set channel format chnl.setFormat(ChannelFormat.RSS_2_0); // 1 description element chnl.setDescription(channel.getChildTextTrim("description", defNS)); // 1 link element chnl.setSite(ParserUtils.getURL(channel.getChildTextTrim("link", defNS))); // 1 language element chnl.setLanguage(channel.getChildTextTrim("language", defNS)); // 1..n item elements List items = channel.getChildren("item", defNS); for (Object item1 : items) { Element item = (Element) item1; 
ParserUtils.matchCaseOfChildren(item, new String[] { "title", "link", "encoded", "description", "subject", "category", "pubDate", "date", "author", "creator", "comments", "guid", "source", "enclosure" }); // get title element Element elTitle = item.getChild("title", defNS); String strTitle = "<No Title>"; if (elTitle != null) { strTitle = elTitle.getTextTrim(); } if (logger.isDebugEnabled()) { logger.debug("Item element found (" + strTitle + ")."); } // get link element Element elLink = item.getChild("link", defNS); String strLink = ""; if (elLink != null) { strLink = elLink.getTextTrim(); } // get description element Element elDesc = item.getChild("encoded", contentNS); if (elDesc == null) { elDesc = item.getChild("description", defNS); } String strDesc = ""; if (elDesc != null) { strDesc = elDesc.getTextTrim(); } // generate new RSS item (link to article) ItemIF rssItem = cBuilder.createItem(item, chnl, strTitle, strDesc, ParserUtils.getURL(strLink)); // get subject element Element elSubject = item.getChild("subject", defNS); if (elSubject == null) { // fallback mechanism: get dc:subject element elSubject = item.getChild("subject", dcNS); } if (elSubject != null) { rssItem.setSubject(elSubject.getTextTrim()); } // get category list // get list of <category> elements List listCategory = item.getChildren("category", defNS); if (listCategory.size() < 1) { // fallback mechanism: get dc:category element listCategory = item.getChildren("category", dcNS); } if (listCategory.size() > 0) { RecursiveHashtable<String> catTable = new RecursiveHashtable<String>(); // for each category, parse hierarchy for (Object aListCategory : listCategory) { RecursiveHashtable<String> currTable = catTable; Element elCategory = (Element) aListCategory; // get contents of category element String[] titles = elCategory.getTextNormalize().split("/"); for (String title : titles) { // tokenize category string to extract out hierarchy if (!currTable.containsKey(title)) { // if token does not exist 
in current map, add it with child Hashtable currTable.put(title, new RecursiveHashtable<String>()); } // reset current Hashtable to child's Hashtable then iterate to next token currTable = currTable.get(title); } } ArrayList<CategoryIF> catList = new ArrayList<CategoryIF>(); // transform cat list & hierarchy into list of CategoryIF elements Enumeration<String> enumCategories = catTable.keys(); while (enumCategories.hasMoreElements()) { String key = enumCategories.nextElement(); // build category list: getCategoryList(parent, title, children) CategoryIF cat = getCategoryList(null, key, catTable.get(key)); catList.add(cat); } if (catList.size() > 0) { // if categories were actually created, then add list to item node rssItem.setCategories(catList); } } // get publication date Element elDate = item.getChild("pubDate", defNS); if (elDate == null) { // fallback mechanism: get dc:date element elDate = item.getChild("date", dcNS); } if (elDate != null) { rssItem.setDate(ParserUtils.getDate(elDate.getTextTrim())); } rssItem.setFound(dateParsed); // get Author element Element elAuthor = item.getChild("author", defNS); if (elAuthor == null) { // fallback mechanism: get dc:creator element elAuthor = item.getChild("creator", dcNS); } if (elAuthor != null) rssItem.setCreator(elAuthor.getTextTrim()); // get Comments element Element elComments = item.getChild("comments", defNS); String strComments = ""; if (elComments != null) { strComments = elComments.getTextTrim(); } rssItem.setComments(ParserUtils.getURL(strComments)); // get guid element Element elGuid = item.getChild("guid", defNS); if (elGuid != null) { String guidUrl = elGuid.getTextTrim(); if (guidUrl != null) { boolean permaLink = true; Attribute permaLinkAttribute = elGuid.getAttribute("isPermaLink", defNS); if (permaLinkAttribute != null) { String permaLinkStr = permaLinkAttribute.getValue(); if (permaLinkStr != null) { permaLink = Boolean.valueOf(permaLinkStr); } } ItemGuidIF itemGuid = 
cBuilder.createItemGuid(rssItem, guidUrl, permaLink); rssItem.setGuid(itemGuid); } } // get source element Element elSource = item.getChild("source", defNS); if (elSource != null) { String sourceName = elSource.getTextTrim(); Attribute sourceAttribute = elSource.getAttribute("url", defNS); if (sourceAttribute != null) { String sourceLocation = sourceAttribute.getValue().trim(); ItemSourceIF itemSource = cBuilder.createItemSource(rssItem, sourceName, sourceLocation, null); rssItem.setSource(itemSource); } } // get enclosure element Element elEnclosure = item.getChild("enclosure", defNS); if (elEnclosure != null) { URL location = null; String type = null; int length = -1; Attribute urlAttribute = elEnclosure.getAttribute("url", defNS); if (urlAttribute != null) { location = ParserUtils.getURL(urlAttribute.getValue().trim()); } Attribute typeAttribute = elEnclosure.getAttribute("type", defNS); if (typeAttribute != null) { type = typeAttribute.getValue().trim(); } Attribute lengthAttribute = elEnclosure.getAttribute("length", defNS); if (lengthAttribute != null) { try { length = Integer.parseInt(lengthAttribute.getValue().trim()); } catch (NumberFormatException e) { logger.warn(e); } } ItemEnclosureIF itemEnclosure = cBuilder.createItemEnclosure(rssItem, location, type, length); rssItem.setEnclosure(itemEnclosure); } } // 0..1 image element Element image = channel.getChild("image", defNS); if (image != null) { ParserUtils.matchCaseOfChildren(image, new String[] { "title", "url", "link", "width", "height", "description" }); ImageIF rssImage = cBuilder.createImage(image.getChildTextTrim("title", defNS), ParserUtils.getURL(image.getChildTextTrim("url", defNS)), ParserUtils.getURL(image.getChildTextTrim("link", defNS))); Element imgWidth = image.getChild("width", defNS); if (imgWidth != null) { try { rssImage.setWidth(Integer.parseInt(imgWidth.getTextTrim())); } catch (NumberFormatException e) { logger.warn("Error parsing width: " + e.getMessage()); } } Element imgHeight = 
image.getChild("height", defNS); if (imgHeight != null) { try { rssImage.setHeight(Integer.parseInt(imgHeight.getTextTrim())); } catch (NumberFormatException e) { logger.warn("Error parsing height: " + e.getMessage()); } } Element imgDescr = image.getChild("description", defNS); if (imgDescr != null) { rssImage.setDescription(imgDescr.getTextTrim()); } chnl.setImage(rssImage); } // 0..1 textinput element Element txtinp = channel.getChild("textinput", defNS); if (txtinp != null) { ParserUtils.matchCaseOfChildren(txtinp, new String[] { "title", "description", "name", "link" }); TextInputIF rssTextInput = cBuilder.createTextInput(txtinp.getChildTextTrim("title", defNS), txtinp.getChildTextTrim("description", defNS), txtinp.getChildTextTrim("name", defNS), ParserUtils.getURL(txtinp.getChildTextTrim("link", defNS))); chnl.setTextInput(rssTextInput); } // 0..1 copyright element Element copyright = channel.getChild("copyright", defNS); if (copyright != null) { chnl.setCopyright(copyright.getTextTrim()); } // 0..1 Rating element Element rating = channel.getChild("rating", defNS); if (rating != null) { chnl.setRating(rating.getTextTrim()); } // 0..1 Docs element Element docs = channel.getChild("docs", defNS); if (docs != null) { chnl.setDocs(docs.getTextTrim()); } // 0..1 Generator element Element generator = channel.getChild("generator", defNS); if (generator != null) { chnl.setGenerator(generator.getTextTrim()); } // 0..1 ttl element Element ttl = channel.getChild("ttl", defNS); if (ttl != null) { String ttlValue = ttl.getTextTrim(); try { chnl.setTtl(Integer.parseInt(ttlValue)); } catch (NumberFormatException e) { logger.warn("Invalid TTL format: '" + ttlValue + "'"); } } // 0..1 pubDate element Element pubDate = channel.getChild("pubDate", defNS); if (pubDate != null) { chnl.setPubDate(ParserUtils.getDate(pubDate.getTextTrim())); } // 0..1 lastBuildDate element Element lastBuildDate = channel.getChild("lastBuildDate", defNS); if (lastBuildDate != null) { 
chnl.setLastBuildDate(ParserUtils.getDate(lastBuildDate.getTextTrim())); } // get category list // get list of <category> elements List listCategory = channel.getChildren("category", defNS); if (listCategory.size() < 1) { // fallback mechanism: get dc:category element listCategory = channel.getChildren("category", dcNS); } if (listCategory.size() > 0) { RecursiveHashtable<String> catTable = new RecursiveHashtable<String>(); // for each category, parse hierarchy for (Object aListCategory : listCategory) { RecursiveHashtable<String> currTable = catTable; Element elCategory = (Element) aListCategory; // get contents of category element String[] titles = elCategory.getTextNormalize().split("/"); for (String title : titles) { // tokenize category string to extract out hierarchy if (!currTable.containsKey(title)) { // if token does not exist in current map, add it with child Hashtable currTable.put(title, new RecursiveHashtable<String>()); } // reset current Hashtable to child's Hashtable then iterate to next token currTable = currTable.get(title); } } ArrayList<CategoryIF> catList = new ArrayList<CategoryIF>(); // transform cat list & hierarchy into list of CategoryIF elements Enumeration<String> enumCategories = catTable.keys(); while (enumCategories.hasMoreElements()) { String key = enumCategories.nextElement(); // build category list: getCategoryList(parent, title, children) CategoryIF cat = getCategoryList(null, key, catTable.get(key)); catList.add(cat); } if (catList.size() > 0) { // if categories were actually created, then add list to item node chnl.setCategories(catList); } } // 0..1 managingEditor element Element managingEditor = channel.getChild("managingEditor", defNS); if (managingEditor != null) { chnl.setCreator(managingEditor.getTextTrim()); } // 0..1 webMaster element Element webMaster = channel.getChild("webMaster", defNS); if (webMaster != null) { chnl.setPublisher(webMaster.getTextTrim()); } // 0..1 cloud element Element cloud = 
channel.getChild("cloud", defNS); if (cloud != null) { String _port = cloud.getAttributeValue("port", defNS); int port = -1; if (_port != null) { try { port = Integer.parseInt(_port); } catch (NumberFormatException e) { logger.warn(e); } } chnl.setCloud(cBuilder.createCloud(cloud.getAttributeValue("domain", defNS), port, cloud.getAttributeValue("path", defNS), cloud.getAttributeValue("registerProcedure", defNS), cloud.getAttributeValue("protocol", defNS))); } chnl.setLastUpdated(dateParsed); // 0..1 skipHours element // 0..1 skipDays element return chnl; }