// Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package com.spd.ukraine.lucenewebsearch1.web; import com.spd.ukraine.lucenewebsearch1.model.WebPage; import com.spd.ukraine.lucenewebsearch1.service.WebPagesService; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.SocketTimeoutException; import java.net.URL; import java.nio.file.FileAlreadyExistsException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.StringTokenizer; import java.util.TreeMap; import java.util.TreeSet; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.PostConstruct; import javax.validation.Valid; import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; 
import org.springframework.stereotype.Controller; import org.springframework.validation.BindingResult; import org.springframework.validation.Errors; import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestMethod; import org.springframework.web.context.request.WebRequest; import org.springframework.web.servlet.ModelAndView; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.springframework.beans.factory.annotation.Autowired; /** * * @author sf */ @Controller public class IndexingController { private static final int MAX_HITS = 100; private static final String URL_FIELD = "url"; private static final String TITLE_FIELD = "title"; private static final String CONTENT_FIELD = "content"; private static final String HREF = "href"; private static final String SORTING_BY_RELEVANCE = "Sorting By Relevance"; private static final String SORTING_ALPHABETICALLY = "Sorting Alphabetically"; private static final boolean IS_DIRECTORY_IN_DISK = false; private static final int MAX_NUMBER_SITES_INDEXED = 300; private static final int RESULTS_PER_PAGE = 10; private static final String HIGHLIGHT_OPEN = "<b><span style='background-color:yellow'>"; private static final String HIGHLIGHT_CLOSE = "</span></b>"; private static final int MAX_WINDOW_SIZE = 500; private Integer MAX_RECURSION_SEARCH_NUMBER = 2; //static final private int lastPageResultIndex = RESULTS_PER_PAGE; private String sortingOrder = SORTING_BY_RELEVANCE; private File indexDir = null; private Directory directory = null; private StandardAnalyzer analyzer = null; private IndexWriter indexWriter = null; private final Set<String> referencedSites = new HashSet<>(); private final Set<String> referencedTitles = new HashSet<>(); List<WebPage> foundSearchResults = new ArrayList<>(); 
List<WebPage> foundSearchResultsSorted = new ArrayList<>(); // @Autowired // private WebPagesService webPagesService; @PostConstruct public void init() { if (IS_DIRECTORY_IN_DISK) { String userDirectory = System.getProperty("user.dir");// + "/lucene"; System.out.println("userDirectory " + userDirectory); Path userPath = Paths.get(userDirectory); Path rootPath = userPath.getRoot(); String workingDirectory = rootPath.toString() .concat(System.getProperty("file.separator").equals("/") ? userPath.subpath(0, 2).toString() + "/" : "\\Users\\sf\\") .concat("luceneindex"); System.out.println("workingDirectory " + workingDirectory); indexDir = new File(workingDirectory); try { Files.createDirectory(Paths.get(workingDirectory)); } catch (FileAlreadyExistsException ex) { System.out.println("FileAlreadyExistsException"); } catch (IOException ex) { // System.out.println("IOException: " + ex.getMessage()); ex.printStackTrace(); } if (null == indexDir) { return; } try { directory = FSDirectory.open(indexDir); } catch (IOException ex) { System.out.println("IOException: " + ex.getMessage()); } } else { directory = new RAMDirectory(); } analyzer = new StandardAnalyzer(Version.LUCENE_43);//new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer); try { indexWriter = new IndexWriter(directory, config); } catch (IOException ex) { // ex.printStackTrace(); // return; } } /** * Method used to create indexing for web page url to be entered. * * @return model of view */ @RequestMapping(value = { "/index", "/" }, method = RequestMethod.GET) public ModelAndView welcomePage() { System.out.println("welcomePage"); ModelAndView model = new ModelAndView(); WebPage q = new WebPage(); q.setTitle(MAX_RECURSION_SEARCH_NUMBER.toString()); model.addObject("q", q); model.setViewName("index"); return model; } /** * Method used to create indexing for entered web page content and web * pages referenced from given web page. 
* * @param webPage webPage.url == entered web-page url * @param result used to detect errors in form * @param request for future code * @param errors for future code * @return model of the success view or that of the create indexing page */ @RequestMapping(value = "/indexing", method = RequestMethod.POST) public ModelAndView startIndexing(@ModelAttribute("q") @Valid WebPage webPage, BindingResult result, WebRequest request, Errors errors) { System.out.println("start indexing q = " + webPage.getUrl()); int maxRecursion = MAX_RECURSION_SEARCH_NUMBER; try { maxRecursion = Integer.parseInt(webPage.getTitle()); } catch (NumberFormatException e) { } MAX_RECURSION_SEARCH_NUMBER = Math.abs(maxRecursion); WebPage created = new WebPage(); if (!result.hasErrors()) { System.out.println("!result.hasErrors()"); created = createWebPageRecord(webPage); } if (created == null) { System.out.println("created == null"); result.rejectValue("url", "label.not.reached.address"); } if (result.hasErrors()) { System.out.println("result.hasErrors()"); ModelAndView model = new ModelAndView("index"); model.addObject("q", webPage); return model; } else { ModelAndView model = new ModelAndView("root"); model.addObject("q", new WebPage()); return model; } } /** * Method used to initiate search of given keywords in data base. 
* * @param keywords keywords * @param result used to detect errors in form * @param request for future code * @param errors for future code * @return model of the success view or that of the registration page */ @RequestMapping(value = { "/search" }) //,method = RequestMethod.POST) public ModelAndView search(@ModelAttribute("q") WebPage keywords, BindingResult result, WebRequest request, Errors errors) { System.out.println("search for keywords q = " + keywords.getTitle()); foundSearchResults.clear(); foundSearchResultsSorted.clear(); if (!result.hasErrors()) { System.out.println("!result.hasErrors()"); try { foundSearchResults = new ArrayList<>(searchPhrase(keywords.getTitle(), CONTENT_FIELD)); } catch (IOException | ParseException ex) { Logger.getLogger(IndexingController.class.getName()).log(Level.SEVERE, null, ex); } } if (foundSearchResults.isEmpty()) { System.out.println("found.isEmpty()"); result.rejectValue("title", "label.not.found"); } if (result.hasErrors()) { System.out.println("result.hasErrors()"); ModelAndView model = new ModelAndView("root"); model.addObject("q", keywords); return model; } else { lastPageResultIndex = RESULTS_PER_PAGE; ModelAndView model = new ModelAndView("results"); model.addObject("found", returnSubListOfSortedList()); //foundSearchResults.subList(0, lastPageResultIndex)); model.addObject("sorting", returnInvertedSortingOrder()); return model; } } /** * Method used to redirect to page with next (RESULTS_PER_PAGE) results. * * @return model with next results */ @RequestMapping(value = "next") public ModelAndView nextResultsPage() { lastPageResultIndex += RESULTS_PER_PAGE; lastPageResultIndex = lastPageResultIndex <= foundSearchResults.size() ? 
lastPageResultIndex : foundSearchResults.size(); System.out.println("next " + lastPageResultIndex); ModelAndView model = new ModelAndView(); WebPage q = new WebPage(); q.setTitle(MAX_RECURSION_SEARCH_NUMBER.toString()); model.addObject("found", returnSubListOfSortedList()); model.addObject("sorting", returnInvertedSortingOrder()); model.setViewName("results"); return model; } /** * Method used to provide view with the selected sorting. * * changes sortingOrder, lastPageResultIndex, and foundSearchResultsSorted * @return model of the first page with new selected sorting */ @RequestMapping(value = "sorting") public ModelAndView sorting() { sortingOrder = returnInvertedSortingOrder(); // lastPageResultIndex = lastPageResultIndex <= foundSearchResults.size() // ? lastPageResultIndex : foundSearchResults.size(); lastPageResultIndex = RESULTS_PER_PAGE; System.out.println("next " + lastPageResultIndex); WebPage q = new WebPage(); q.setTitle(MAX_RECURSION_SEARCH_NUMBER.toString()); ModelAndView model = new ModelAndView(); model.addObject("sorting", returnInvertedSortingOrder()); model.addObject("found", returnSubListOfSortedList()); model.setViewName("results"); return model; } /** * Method used to create the record of given web page in search database. * * @param webPage webPage.url is entered url * @return webPage for success or null for fail */ private WebPage createWebPageRecord(WebPage webPage) { try { Document html = Jsoup.connect(webPage.getUrl()).get(); referencedSites.clear(); indexElements(webPage, html, 0); System.out.println(html.text()); System.out.println("number of indexed fields is " + indexWriter.numDocs()); // indexWriter.commit(); indexWriter.close(); return webPage; } catch (Exception ex) { System.out.println("createWebPageRecord " + ex.getMessage()); // ex.printStackTrace(); return null; } } /** * Method used to perform recursive creation indexing for a given web page * in search database. 
* * @param webPage webPage.url is entered url * webPage.title is set * @param html Jsoup.Document of entered url * @param recursionNumber used to stop recursion at exceeding * MAX_RECURSION_SEARCH_NUMBER */ private void indexElements(WebPage webPage, Document html, final int recursionNumber) throws IOException, ParseException { String title = html.title(); if (referencedTitles.contains(title.trim())) { return; } referencedTitles.add(title.trim()); webPage.setTitle(title); if (containsPage(webPage)) { System.out.println(webPage.getUrl() + " is already indexed"); return; } Element prevElement = null; Elements elements = html.body().getAllElements(); //.getElementsByTag("a"); addDoc(webPage, html.text()); // for (Element element : elements) { //// System.out.println(element.nodeName() + " element.text() " //// + element.text() + " url " //// + element.absUrl("href")); // if (element.nodeName().equalsIgnoreCase("body")) { // addDoc(webPage, element.text()); // break; //// continue; // } // if (null == prevElement) { // prevElement = element; //// } else if (prevElementContainsElementText(prevElement, element)) { //// continue; // } //// if (null !== webPagesService.findWebPage(element.absUrl("href"))) // if (element.text().trim().isEmpty()) { // continue; // } //// StringTokenizer str = new StringTokenizer(element.text()); //// str. 
// addDoc(webPage, element.text()); // } if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) { // System.out.println(recursionNumber + " " // + referencedSites.contains(webPage.getUrl())); return; } elements.parallelStream() .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF) && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF)) && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF)))) .forEach((Element element) -> { WebPage webPage1 = new WebPage(element.absUrl(HREF)); String url1 = webPage1.getUrl(); // System.out.println(recursionNumber + " recursion for '" // + url1 + "'"); try { Document htmlR = Jsoup.connect(url1).get(); indexElements(webPage1, htmlR, recursionNumber + 1); } catch (IOException | ParseException e) { System.out.println("Exception " + e.getMessage()); } referencedSites.add(url1); }); // for (Element element : elements) { // if (!element.nodeName().equalsIgnoreCase("a")) { // continue; // } // WebPage webPage1 = new WebPage(element.absUrl("href")); // if (null == webPage1.getUrl() // || webPage1.getUrl().isEmpty() // || referencedSites.contains(webPage1.getUrl())) { // continue; // } // System.out.println(recursionNumber + "recursion for " // + element.absUrl("href")); // try { // Document htmlR = Jsoup.connect(webPage1.getUrl()).get(); // webPage1.setTitle(htmlR.title()); // indexElements(webPage1, htmlR, recursionNumber + 1); // } catch (IOException e) { // System.out.println("IOException " + e.getMessage()); // } // referencedSites.add(webPage1.getUrl()); // } } private boolean prevElementContainsElementText(Element prevElement, Element element) { return (prevElement.hasText() && element.hasText() && prevElement.text().contains(element.text())); } /** * Method used to add documents to search database . 
* * @param webPage webPage.url is entered url; webPage.title is also added * @param text is also added */ private void addDoc(WebPage webPage, String text) throws IOException { org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); doc.add(new TextField(URL_FIELD, QueryParser.escape(webPage.getUrl()), Field.Store.YES)); doc.add(new TextField(TITLE_FIELD, webPage.getTitle(), Field.Store.YES)); doc.add(new TextField(CONTENT_FIELD, text, Field.Store.YES)); System.out.println("text '" + text + "'"); System.out.println("addDocument(doc) " + doc.get(URL_FIELD)); if (null == indexWriter) { IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer); System.out.println("config " + config); indexWriter = new IndexWriter(directory, config); } // System.out.println("indexWriter " + indexWriter); indexWriter.addDocument(doc); } public boolean containsPage(String url) throws IOException, ParseException { // searchIndex(indexDir, url, hits); // Directory directory = FSDirectory.open(indexDir);//.getDirectory(indexDir); try { System.out.println("directory.listAll() " + Arrays.toString(directory.listAll())); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); QueryParser parser = new QueryParser(Version.LUCENE_43, URL_FIELD, new StandardAnalyzer(Version.LUCENE_43)); //new SimpleAnalyzer() org.apache.lucene.search.Query query = parser.parse(QueryParser.escape(url)); TopDocs topDocs = searcher.search(query, 1); ScoreDoc[] hits = topDocs.scoreDocs; System.out.println("hits.length " + hits.length); return hits.length > 0; } catch (org.apache.lucene.index.IndexNotFoundException ex) { return false; } } private boolean containsPage(WebPage webPage) { try { return !searchPhrase(webPage.getTitle(), TITLE_FIELD).isEmpty(); } catch (IOException | ParseException ex) { return false; } } /** * Method used to search phrase in search database. 
* * @param phrase * @param fieldName to search in it * @return Collection<WebPage> with search data saved in WebPage fields * @throws java.io.IOException * @throws org.apache.lucene.queryparser.classic.ParseException */ public Collection<WebPage> searchPhrase(String phrase, String fieldName) throws IOException, ParseException { if (null == phrase) { return new ArrayList<>(); } // searchIndex(indexDir, phrase, hits); // Directory directory = FSDirectory.open(indexDir);//.getDirectory(indexDir); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(indexReader); QueryParser parser = new QueryParser(Version.LUCENE_43, fieldName, new StandardAnalyzer(Version.LUCENE_43)); //new SimpleAnalyzer() org.apache.lucene.search.Query query = parser.parse(phrase); TopDocs topDocs = searcher.search(query, MAX_HITS); ScoreDoc[] hits = topDocs.scoreDocs; List<WebPage> searchResults = new ArrayList<>(); for (ScoreDoc hit : hits) { int docId = hit.doc; org.apache.lucene.document.Document d = searcher.doc(docId); System.out.println("'" + d.get(URL_FIELD) + "' '" + d.get(TITLE_FIELD) + "'"); // + " " + d.get("content")); WebPage webPage = new WebPage(); webPage.setUrl(d.get(URL_FIELD)); webPage.setTitle(highLightPhrase(d.get(TITLE_FIELD), phrase)); webPage.setContent(truncateText(d.get(CONTENT_FIELD), phrase)); searchResults.add(webPage); } System.out.println("Found " + hits.length); return new LinkedHashSet<>(searchResults); } private Object returnSubListOfSortedList() { List<WebPage> sortedList; if (sortingOrder.equals(SORTING_ALPHABETICALLY)) { if (foundSearchResultsSorted.isEmpty()) { foundSearchResultsSorted = new ArrayList<>(foundSearchResults); Collections.sort(foundSearchResultsSorted, (o1, o2) -> { String title1 = o1.getTitle().replace(HIGHLIGHT_OPEN, ""); String title2 = o2.getTitle().replace(HIGHLIGHT_OPEN, ""); return title1.compareToIgnoreCase(title2); }); } sortedList = foundSearchResultsSorted; // 
System.out.println("sortedList '" + sortedList.get(0).getTitle() // + "', '" + sortedList.get(1).getTitle()); } else { sortedList = foundSearchResults; } return sortedList.subList(lastPageResultIndex - RESULTS_PER_PAGE, Math.min(lastPageResultIndex, sortedList.size())); } private String returnInvertedSortingOrder() { return sortingOrder.equals(SORTING_ALPHABETICALLY) ? SORTING_BY_RELEVANCE : SORTING_ALPHABETICALLY; } private String highLightPhrase(String text, String phrase) { String lowerText = text.toLowerCase(); String lowerPhrase = phrase.toLowerCase(); StringBuilder newText = new StringBuilder(text); for (int i = lowerText.indexOf(lowerPhrase); i >= 0; i = lowerText.indexOf(lowerPhrase, i + HIGHLIGHT_OPEN.length() + phrase.length() + HIGHLIGHT_CLOSE.length() + 1)) { newText.insert(i, HIGHLIGHT_OPEN).insert(i + HIGHLIGHT_OPEN.length() + phrase.length(), HIGHLIGHT_CLOSE); lowerText = newText.toString().toLowerCase(); } return newText.toString(); } private String removeSharpEtc(String absUrl) { absUrl = absUrl.trim(); if (absUrl.contains("#")) { absUrl = absUrl.split("#")[0]; } if (absUrl.endsWith("index.html")) { absUrl = absUrl.replace("index.html", ""); } if (absUrl.startsWith("http")) { if (absUrl.startsWith("https")) { absUrl = absUrl.replaceFirst("https", "http"); } else { absUrl = absUrl.replaceFirst("http", "https"); } } return absUrl; } private String truncateText(String text, String phrase) { StringBuilder inputPhrase = new StringBuilder(); // try { // URL url = new URL(text); // BufferedReader in = new BufferedReader(new InputStreamReader(url // .openStream())); // String inputLine; // while ((inputLine = in.readLine()) != null) { // inputPhrase.append(inputLine); // if (inputPhrase.toString().contains(phrase) // && inputPhrase.length() > MAX_WINDOW_SIZE) { // break; // } // } // in.close(); // } catch (IOException ex) { // return ""; // } inputPhrase = new StringBuilder(text); String lowerText = text.toLowerCase(); int pos = lowerText.indexOf(phrase); 
try { inputPhrase.delete(0, Math.max(0, pos - MAX_WINDOW_SIZE)) .delete(Math.min(pos + MAX_WINDOW_SIZE, inputPhrase.length()), inputPhrase.length()); } catch (StringIndexOutOfBoundsException e) { System.out.println("pos - MAX_WINDOW_SIZE " + (pos - MAX_WINDOW_SIZE)); System.out.println("pos + MAX_WINDOW_SIZE " + (pos + MAX_WINDOW_SIZE) + " < " + inputPhrase.length()); System.out.println(e.getMessage() + " text " + text); } return highLightPhrase(inputPhrase.toString(), phrase); } }