Java tutorial
/* * Copyright 2014 Gleb Godonoga. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.andrada.sitracker.reader; import com.andrada.sitracker.Constants; import com.andrada.sitracker.db.beans.Author; import com.andrada.sitracker.db.beans.Publication; import com.andrada.sitracker.exceptions.AddAuthorException; import com.andrada.sitracker.util.SamlibPageHelper; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class SamlibAuthorPageReader implements AuthorPageReader { private String pageContent; public SamlibAuthorPageReader(String page) { this.pageContent = this.sanitizeHTML(page); } @NotNull @Override public Author getAuthor(@NotNull String url) throws AddAuthorException { Author author = new Author(); author.setUrl(url); String urlId = SamlibPageHelper.getUrlIdFromCompleteUrl(url); author.setUrlId(urlId); author.setName(getAuthorName()); author.setUpdateDate(getAuthorUpdateDate()); author.setAuthorDescription(getAuthorDescription()); author.setAuthorImageUrl(getAuthorImageUrl(url)); return author; } @NotNull @Override public List<Publication> getPublications(@NotNull Author author) { ArrayList<Publication> publicationList = new ArrayList<Publication>(); Pattern pattern = Pattern.compile(Constants.PUBLICATIONS_REGEX, Pattern.CASE_INSENSITIVE); Matcher matcher = pattern.matcher(pageContent); while (matcher.find()) { Publication item = new Publication(); String baseUrl = author.getUrl().replace(Constants.AUTHOR_PAGE_URL_ENDING_WI_SLASH, ""); baseUrl = baseUrl.replace(Constants.AUTHOR_PAGE_ALT_URL_ENDING_WI_SLASH, ""); item.setAuthor(author); item.setUpdateDate(new Date()); //Group 1 - LinkToText String itemURL = matcher.group(3) == null ? "" : matcher.group(3); item.setUrl(baseUrl + "/" + itemURL); //Group 2 - NameOfText String itemTitle = matcher.group(4) == null ? "" : matcher.group(4); item.setName(escapeHTML(itemTitle)); //Group 3 - SizeOfText String sizeOfText = matcher.group(5) == null ? "0" : matcher.group(5); item.setSize(Integer.parseInt(sizeOfText)); //Group 4 - DescriptionOfRating String descriptionOfRating = matcher.group(6) == null ? "" : matcher.group(6); item.setRating(escapeHTML(descriptionOfRating)); //Group 5 - Rating String rating = matcher.group(7) == null ? "0" : matcher.group(7); //Group 6 - Section String categoryName = matcher.group(8) == null ? "" : matcher.group(8); item.setCategory(escapeHTML(categoryName).replace("@", "")); //Group 7 - Genres String genre = matcher.group(9) == null ? "" : matcher.group(9); //Group 8 - Link to Comments String commentsUrl = matcher.group(10) == null ? "" : matcher.group(10); item.setCommentUrl(commentsUrl); //Group 9 - CommentsDescription String commentsDescription = matcher.group(11) == null ? "" : matcher.group(11); //Group 10 - CommentsCount String commentsCount = matcher.group(12) == null ? "0" : matcher.group(12); item.setCommentsCount(Integer.parseInt(commentsCount)); //Group 11 - Description String itemDescription = matcher.group(13) == null ? "" : matcher.group(13); item.setDescription(itemDescription.trim()); item.setImageUrl(extractImage(itemDescription.trim())); String imagesPageUrl = matcher.group(14) == null ? null : matcher.group(14); item.setImagePageUrl(imagesPageUrl); publicationList.add(item); } return publicationList; } @Nullable @Override public String getAuthorImageUrl(String authorUrl) { authorUrl = authorUrl.replace(Constants.AUTHOR_PAGE_URL_ENDING_WO_SLASH, ""); authorUrl = authorUrl.replace(Constants.AUTHOR_PAGE_ALT_URL_ENDING_WO_SLASH, ""); Pattern pattern = Pattern.compile(Constants.AUTHOR_IMAGE_REGEX, Pattern.MULTILINE); Matcher matcher = pattern.matcher(pageContent); String imageUrl = null; if (matcher.find()) { imageUrl = (matcher.group(2)); if (imageUrl != null) imageUrl = authorUrl + imageUrl; } return imageUrl; } @Nullable @Override public String getAuthorDescription() { Pattern pattern = Pattern.compile(Constants.AUTHOR_DESCRIPTION_TEXT_REGEX, Pattern.MULTILINE); Matcher matcher = pattern.matcher(pageContent); String descriptionText = null; if (matcher.find()) { descriptionText = (matcher.group(1)); } return descriptionText; } @Override public boolean isPageBlank() { return pageContent == null || pageContent.length() == 0; } private String sanitizeHTML(String value) { String[] tokensToReplace = new String[] { "<br />", "<BR />", "•", "‹", "›", "™", "⁄", "<", "<", ">", ">", "©", "©", " ", "&NBSP;", """, "\r", "\n", "\f" }; String[] replacements = new String[] { "<br>", "<br>", " * ", "<", ">", "(tm)", "/", "<", "<", ">", ">", "(c)", "(c)", " ", " ", "\"", "", "", "" }; value = StringUtils.replaceEachRepeatedly(value, tokensToReplace, replacements); return value; } private static String escapeHTML(String value) { value = value.replaceAll("(?si)[\\r\\n\\x85\\f]+", "").replaceAll("(?i)<(br|li)[^>]*>", "\n") .replaceAll("(?i)<td[^>]*>", "\t").replaceAll("(?si)<script[^>]*>.*?</\\s*script[^>]*>", "") .replaceAll("<[^>]*>", "").replaceAll("(?si)\\n[\\p{Z}\\t]+\\n", "\n\n") .replaceAll("(?si)\\n\\n+", "\\n\\n"); return value; } @Nullable private static String extractImage(@NotNull String itemDescription) { String imgUrl = null; Pattern pattern = Pattern.compile("(<a[^>]*>)?\\s*?<img src=[\"'](.*?)[\"'][^>]*>\\s?(</a>)?"); Matcher matcher = pattern.matcher(itemDescription); if (matcher.find()) { String match = matcher.group(2); if (match != null) { imgUrl = match.trim(); } } return imgUrl; } @NotNull private String getAuthorName() throws AddAuthorException { int index = pageContent.indexOf('.', pageContent.indexOf("<title>")) + 1; if (index == -1) { throw new AddAuthorException(AddAuthorException.AuthorAddErrors.AUTHOR_NAME_NOT_FOUND); } int secondPointIndex = pageContent.indexOf(".", index); if (secondPointIndex == -1) { throw new AddAuthorException(AddAuthorException.AuthorAddErrors.AUTHOR_NAME_NOT_FOUND); } String authorName = pageContent.substring(index, secondPointIndex); if ("".equals(authorName.trim())) { throw new AddAuthorException(AddAuthorException.AuthorAddErrors.AUTHOR_NAME_NOT_FOUND); } return authorName; } private Date getAuthorUpdateDate() throws AddAuthorException { Pattern pattern = Pattern.compile(Constants.AUTHOR_UPDATE_DATE_REGEX, Pattern.MULTILINE); Matcher matcher = pattern.matcher(pageContent); Date date = new Date(); if (matcher.find()) { SimpleDateFormat ft = new SimpleDateFormat(Constants.AUTHOR_UPDATE_DATE_FORMAT); try { date = ft.parse(matcher.group(1)); } catch (ParseException e) { throw new AddAuthorException(AddAuthorException.AuthorAddErrors.AUTHOR_DATE_NOT_FOUND); } } return date; } }