Java tutorial
/* * Copyright (c) <2016> <Oleksandr Tyshkovets> * * This file is part of uchan-analyzer. * * uchan-analyzer is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * uchan-analyzer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with uchan-analyzer. If not, see <http://www.gnu.org/licenses/>. */ package com.github.aint.uchananalyzer; import com.github.aint.uchananalyzer.Post.Board; import com.google.gson.GsonBuilder; import com.google.gson.JsonArray; import com.google.gson.JsonObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.FileWriter; import java.io.IOException; import java.io.Writer; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * @author Oleksandr Tyshkovets */ public class Parser { private static final int LAST_PAGE = 2700; private static final String UCHAN_LP_URL = "http://uchan.to/lp/"; private static final String UCHAN_LP_PAGE_URL = "http://uchan.to/lp/index.php?board=all&page="; private static final String TITLE_ATTRIBUTE = "title"; private static final String TITLE_ATTRIBUTE_VALUE = ":"; private static final String POST_ATTRIBUTE = "id"; private static final String POST_ATTRIBUTE_VALUE = "post_text_"; private static final String REPLY_CLASS = "reply"; private static final String FILE_SIZE_CLASS = "filesize"; private static final String P_TAG = "p"; private static final Pattern PAT = Pattern.compile("\\d{4}\\.\\d{2}\\.\\d{2}\\s\\s\\d{2}:\\d{2}"); public static final String NO_DATE_FOUND = "No date found"; public static final String COMMENT_POSTER_NAME_CLASS = "commentpostername"; public static final String FONT_SIZE_ATTRIBUTE = "size"; public static final String FONT_SIZE_ATTRIBUTE_VALUE = "4"; public static final String REMOVE_POST_ID_REFERENCE_REGEX = ">>\\d{1,}"; public static final String YOUTUBE_AS_ATTACHED_FILE_PATTERN = ": -(0 B, 0x0)"; public static final String REFLINK_CLASS = "reflink"; public static final String HREF_ATTRIBUTE = "href"; public static void main(String[] args) throws IOException { String lastPage = getLastPage(Jsoup.connect(UCHAN_LP_URL).get()); Document page = Jsoup.connect(UCHAN_LP_PAGE_URL + lastPage).get(); // Set<String> topics = new HashSet<>(getThreadTopics(page)); // topics.forEach(System.out::println); // save2Json(topics); List<Post> posts = page.getElementsByClass(REPLY_CLASS).stream() .map(element -> new Post(getPostAuthor(element), getPostBoard(element), getPostText(element), getPostDate(element), "", isPostHasImage(element))) .collect(Collectors.toList()); posts.forEach(System.out::println); save2Json(posts, lastPage); } private static String getPostAuthor(Element element) { return element.getElementsByClass(COMMENT_POSTER_NAME_CLASS).text(); } private static LocalDateTime getPostDate(Element element) { String date = element.getElementsByTag("label").text(); Matcher matcher = PAT.matcher(date); if (!matcher.find()) { throw new IllegalStateException(NO_DATE_FOUND); } return LocalDateTime.parse(matcher.group(), DateTimeFormatter.ofPattern("yyyy.MM.dd HH:mm")); } private static String getPostText(Element element) { return element.getElementsByAttributeValueContaining(POST_ATTRIBUTE, POST_ATTRIBUTE_VALUE).stream() .map(e -> e.getElementsByTag(P_TAG).text()) .map(s -> s.replaceAll(REMOVE_POST_ID_REFERENCE_REGEX, "")).filter(s -> !s.isEmpty()) .collect(Collectors.joining()); } private static Board getPostBoard(Element element) { String postLink = element.getElementsByClass(REFLINK_CLASS).first().children().first().attr(HREF_ATTRIBUTE); System.out.println(postLink); String board = postLink.split("/")[1].toUpperCase(); System.out.println(board); return Board.valueOf(board); } private static boolean isPostHasImage(Element element) { Elements fileElement = element.getElementsByClass(FILE_SIZE_CLASS); return !fileElement.isEmpty() && !YOUTUBE_AS_ATTACHED_FILE_PATTERN.equals(fileElement.text()); } private static void save2Json(Collection<Post> posts, String pageNumber) { JsonArray array = new JsonArray(); for (Post post : posts) { JsonObject innerJson = new JsonObject(); innerJson.addProperty("author", post.getAuthor()); innerJson.addProperty("text", post.getText()); innerJson.addProperty("date", post.getDate().toString()); innerJson.addProperty("image", post.isHasImage()); array.add(innerJson); } JsonObject json = new JsonObject(); json.add(pageNumber, array); try (Writer writer = new FileWriter(pageNumber + ".json")) { new GsonBuilder().setPrettyPrinting().create().toJson(json, writer); } catch (IOException e) { e.printStackTrace(); } } private static List<String> getThreadTopics(Document page) { Elements topics = page.getElementsByAttributeValueContaining(TITLE_ATTRIBUTE, TITLE_ATTRIBUTE_VALUE); return topics.stream().map(t -> t.attr("title")).collect(Collectors.toList()); } private static String getLastPage(Document page) { String lastPage = page.getElementsByAttributeValue(FONT_SIZE_ATTRIBUTE, FONT_SIZE_ATTRIBUTE_VALUE).text(); return lastPage.substring(lastPage.lastIndexOf(" ") + 1); } }