Java HTML Jsoup Document getCategoryIds(final Document html)

Here you can find the source of getCategoryIds(final Document html)

Description

Extracts the numeric category IDs from the breadcrumb data embedded in one of the page's script elements.

License

Open Source License

Declaration

static int[] getCategoryIds(final Document html) 

Method Source Code

//package com.java2s;
//License from project: Open Source License 

import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class Main {
    /**
     * Substring used to pick the script element that carries the breadcrumb data.
     * Note that it is looser than {@link #BREADCRUMB_REGEX} (no space or bracket after the
     * colon), so a script can pass this check and still fail the regex.
     */
    private static final String BREADCRUMB_MARKER = "PageTemplate.set({breadcrumb:";

    /**
     * Regex whose group 1 captures the comma-separated id list inside
     * {@code PageTemplate.set({breadcrumb: [ ... ]});}. Only digits, commas and hyphens
     * are accepted between the brackets.
     */
    private static final String BREADCRUMB_REGEX = Pattern.quote("PageTemplate.set({breadcrumb: [")
            + "([0-9,-]+)" + Pattern.quote("]});");

    /**
     * Extracts the category ids from the breadcrumb definition embedded in a page's scripts.
     *
     * <p>The first {@code <script>} element whose data contains
     * {@code PageTemplate.set({breadcrumb:} is selected, the bracketed id list is pulled
     * out with a regex, split on commas, and each piece is parsed as an {@code int}.
     *
     * @param html the parsed page to inspect
     * @return the ids in the order they appear in the breadcrumb list
     * @throws NoSuchElementException if no script contains the breadcrumb marker, or the
     *         selected script does not match the expected breadcrumb format
     * @throws NumberFormatException if a piece of the id list is not a valid {@code int}
     *         (for example an empty piece produced by two consecutive commas)
     */
    static int[] getCategoryIds(final Document html) {
        // Category taken from breadcrumb, which Wowhead draws with JS :(
        final String breadcrumbData = html.getElementsByTag("script").stream()
                .map(Element::data)
                .filter(data -> data.contains(BREADCRUMB_MARKER))
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException(
                        "No <script> element contains the breadcrumb marker: " + BREADCRUMB_MARKER));

        // ugh, parsing JS with regexes
        final String idList = getRegexGroup(breadcrumbData, BREADCRUMB_REGEX, 1)
                .orElseThrow(() -> new NoSuchElementException(
                        "Breadcrumb script does not match the expected format: " + BREADCRUMB_REGEX));

        return Stream.of(idList.split(",")).mapToInt(Integer::parseInt).toArray();
    }

    /**
     * Finds the first match of {@code regex} in {@code str} and returns one of its groups.
     *
     * @param str   the text to search
     * @param regex the regular expression to compile and search for
     * @param group the index of the capturing group to return (0 is the whole match)
     * @param flags zero or more {@link Pattern} flag constants; they are OR-ed together
     * @return the text captured by the requested group, or an empty Optional if the regex
     *         does not match anywhere in {@code str} or the group did not take part in
     *         the match
     * @throws IndexOutOfBoundsException if the pattern has no group with the given index
     */
    static Optional<String> getRegexGroup(final String str, final String regex, final int group,
            final int... flags) {
        int combinedFlags = 0;

        for (final int flag : flags) {
            combinedFlags |= flag;
        }

        final Matcher matcher = Pattern.compile(regex, combinedFlags).matcher(str);

        if (!matcher.find()) {
            return Optional.empty();
        }

        // Matcher.group returns null for a group that did not participate in the match
        // (e.g. the unused side of an alternation); Optional.of would throw NPE on that.
        return Optional.ofNullable(matcher.group(group));
    }
}

Related

  1. convertLinksToAbsolute(String link, org.jsoup.nodes.Document doc)
  2. detectLanguage(Document doc)
  3. emptyDocument()
  4. formatDocument(Document doc)
  5. getAllText(Document document)
  6. getContainersForLink(Document document, String link)
  7. getDivForClass(Document document, String className)
  8. getDocument(CloseableHttpClient client, String url)
  9. getDocument(final String url)