/** * HostBrowser * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 27.09.2012 at http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailType; import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.graphics.WebStructureGraph.StructureEntry; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.query.QueryParams; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; /** * Browser for indexed resources */ public class HostBrowser { final static long TIMEOUT = 10000L; public static enum StoreType { LINK, INDEX, EXCLUDED, FAILED, RELOAD; } /** * <p>Retrieve local index entries for a path, or for hosts with the most references. Also allow some maintaining operations on entries with load errors.</p> * <p>Some parameters need administrator authentication or unauthenticated local host requests to be allowed : load, deleteLoadErrors, delete, reload404, * hosts="crawling" and hosts="error". * The "load" parameter can also be applied without authentication when "browser.load4everyone" configuration setting is true.</p> * <p> * Configuration settings : * <ul> * <li>browser.autoload : allow the administrator to stack URLs to the local crawl queue, manually with the "load" parameter, * or automatically when the "path" parameter is filled with an unknown URL</li> * <li>browser.load4everyone : allow everyone to stack URLs to the local crawl queue. * "browser.autoload" has also to be set to true to enable automatic loading on an unknown path</li> * <li>publicSearchpage : set to false to restrict use of this servlet to authenticated administrator only</li> * <li>publicTopmenu : set to false to hide the top navigation bar to non authenticated users</li> * <li>decoration.hostanalysis : add supplementary hosts information for debug/analysis purpose</li> * <li>decoration.grafics.linkstructure : display a link structure graph when the path parameter is filled</li> * </ul> * </p> * @param header servlet request header * @param post request parameters. Supported keys :<ul> * <li>admin : when "true", display in the html page render the administration context (menu and top navbar)</li> * <li>path : root URL or host name to browse (ignored when the hosts parameter is filled). When not yet locally indexed, this URL can be automatically crawled and indexed * when "browser.autoload" or "browser.load4everyone" configuration settings are set to true.</li> * <li>load : URL to crawl and index.</li> * <li>deleteLoadErrors : delete from the local index documents with load error (HTTP status different from 200 or any other failure).</li> * <li>hosts : generate hosts with most references list. Supported values : * <ul> * <li>"crawling" : restrict to host currently crawled</li> * <li>"error" : restrict to hosts with having at least one resource load error</li> * </ul> * </li> * <li>delete : delete from the index whole documents tree matching the path prefix</li> * <li>reload404 : reload documents matching the path prefix and which previously failed to load due to a network error</li> * <li>facetcount : </li> * <li>complete : we want only root paths for complete lists</li> * <li>nepr :</li> * <li>showlinkstructure : when present, display a link graph for path</li> * </ul> * @param env server environment * @return the servlet answer object */ @SuppressWarnings({ "unchecked" }) public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; Fulltext fulltext = sb.index.fulltext(); final boolean authorized = sb.verifyAuthentication(header); final boolean autoload = authorized && sb.getConfigBool("browser.autoload", true); final boolean load4everyone = sb.getConfigBool("browser.load4everyone", false); final boolean loadRight = autoload || load4everyone; // add config later final boolean searchAllowed = sb.getConfigBool(SwitchboardConstants.PUBLIC_SEARCHPAGE, true) || authorized; final serverObjects prop = new serverObjects(); // set default values prop.put("path", ""); prop.put("result", ""); prop.put("hosts", 0); prop.put("files", 0); prop.put("hostanalysis", 0); prop.put("admin", "false"); boolean admin = false; String referer = header.get("Referer", ""); if ((post != null && post.getBoolean("admin")) || referer.contains("HostBrowser.html?admin=true")) { prop.put("topmenu", 2); prop.put("admin", "true"); admin = true; } else if (authorized) { // show top nav to admins prop.put("topmenu", 1); } else { // for other respect setting in Search Design Configuration prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0); } final String promoteSearchPageGreeting = (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) ? env.getConfig("network.unit.description", "") : env.getConfig(SwitchboardConstants.GREETING, ""); prop.put("topmenu_promoteSearchPageGreeting", promoteSearchPageGreeting); if (!searchAllowed) { prop.put("result", "You are not allowed to use this page. Please ask an administrator for permission."); prop.putNum("ucount", 0); return prop; } if(authorized) { /* Fill the "admin" parameter for authorized links */ prop.put("authorized_admin", Boolean.toString(admin)); } String path = post == null ? "" : post.get("path", "").trim(); if (authorized) sb.index.fulltext().commit(true); if (post == null || env == null) { prop.putNum("ucount", fulltext.collectionSize()); return prop; } int p = path.lastIndexOf('/'); if (p < 0 && path.length() > 0) path = path + "/"; else if (p > 7) path = path.substring(0, p + 1); // the search path shall always end with "/" if (path.length() > 0 && ( !path.startsWith("http://") && !path.startsWith("https://") && !path.startsWith("ftp://") && !path.startsWith("smb://") && !path.startsWith("file://"))) { path = "http://" + path; } prop.putHTML("path", path); prop.put("delete", authorized && path.length() > 0 ? 1 : 0); DigestURL pathURI = null; try {pathURI = new DigestURL(path);} catch (final MalformedURLException e) {} String load = post.get("load", ""); boolean wait = false; try { if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) { // in case that the url does not exist and loading is wanted turn this request into a loading request load = path; wait = true; } } catch (IOException e1) { load = path; wait = true; } if (load.length() > 0 && loadRight) { // stack URL DigestURL url; if (sb.crawlStacker.size() > 2) wait = false; try { url = new DigestURL(load); String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), url, null, load, new Date(), sb.crawler.defaultProxyProfile.handle(), 0, sb.crawler.defaultProxyProfile.timezoneOffset() )); prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString)); if (wait) waitloop: for (int i = 0; i < 30; i++) { try { if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break; } catch (IOException e1) { e1.printStackTrace(); break waitloop; } try {Thread.sleep(100);} catch (final InterruptedException e) {} } } catch (final MalformedURLException e) { prop.putHTML("result", "bad url '" + load + "'"); } } if (authorized && post.containsKey("deleteLoadErrors")) { try { fulltext.getDefaultConnector().deleteByQuery("-" + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); // make sure field exists ConcurrentLog.info ("HostBrowser:", "delete documents with httpstatus_i <> 200"); fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.fail.name() + "\"" ); ConcurrentLog.info ("HostBrowser:", "delete documents with failtype_s = fail"); fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.excl.name() + "\"" ); ConcurrentLog.info ("HostBrowser:", "delete documents with failtype_s = excl"); prop.putNum("ucount", fulltext.collectionSize()); return prop; } catch (final IOException ex) { ConcurrentLog.logException(ex); } } if (post.containsKey("hosts")) { // generate host list try { boolean onlyCrawling = "crawling".equals(post.get("hosts", "")); boolean onlyErrors = "error".equals(post.get("hosts", "")); int maxcount = authorized ? 2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // which makes nice matrixes for 2, 3, 4, 5, 6, 7, 8, 9 rows/colums // collect hosts from index ReversibleScoreMap<String> hostscore = fulltext.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", maxcount, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName()); if (hostscore == null) hostscore = new ClusteredScoreMap<String>(true); // collect hosts from crawler final Map<String, Integer[]> crawler = (authorized) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>(); final Map<String, Integer> hostNameToPendingCount = new HashMap<>(); for(Entry<String, Integer[]>crawlerEntry: crawler.entrySet()) { /* The local stack returns keys composed of "hostname:port" : we now sum pending URLs counts by host name */ String hostName = Domains.stripToHostName(crawlerEntry.getKey()); Integer pendingCount = hostNameToPendingCount.get(hostName); if(pendingCount == null) { pendingCount = 0; } pendingCount += crawlerEntry.getValue()[0]; hostNameToPendingCount.put(hostName, pendingCount); } // collect the errorurls Map<String, ReversibleScoreMap<String>> exclfacets = authorized ? fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null; ReversibleScoreMap<String> exclscore = exclfacets == null ? new ClusteredScoreMap<String>(true) : exclfacets.get(CollectionSchema.host_s.getSolrFieldName()); Map<String, ReversibleScoreMap<String>> failfacets = authorized ? fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.fail.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null; ReversibleScoreMap<String> failscore = failfacets == null ? new ClusteredScoreMap<String>(true) : failfacets.get(CollectionSchema.host_s.getSolrFieldName()); int c = 0; Iterator<String> i = hostscore.keys(false); String host; while (i.hasNext() && c < maxcount) { host = i.next(); prop.put("hosts_list_" + c + "_admin", admin ? "true" : "false"); prop.putHTML("hosts_list_" + c + "_host", host); boolean inCrawler = hostNameToPendingCount.containsKey(host); int exclcount = exclscore.get(host); int failcount = failscore.get(host); int errors = exclcount + failcount; prop.put("hosts_list_" + c + "_count", hostscore.get(host)); prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0); if (inCrawler) { prop.put("hosts_list_" + c + "_crawler_pending", hostNameToPendingCount.get(host)); } prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0); if (errors > 0) { prop.put("hosts_list_" + c + "_errors_exclcount", exclcount); prop.put("hosts_list_" + c + "_errors_failcount", failcount); } prop.put("hosts_list_" + c + "_type", inCrawler ? 2 : errors > 0 ? 1 : 0); if (onlyCrawling) { if (inCrawler) c++; } else if (onlyErrors) { if (errors > 0) c++; } else { c++; } } prop.put("hosts_list", c); prop.put("hosts_authorized", authorized ? 1 : 0); prop.put("hosts", 1); } catch (final IOException e) { ConcurrentLog.logException(e); } } if (path.length() > 0) { try { DigestURL uri = new DigestURL(path); String host = uri.getHost(); // write host analysis if path after host is empty if (uri.getPath().length() <= 1 && host != null && host.length() > 0 && sb.getConfigBool("decoration.hostanalysis", false)) { //how many documents per crawldepth_i; get crawldepth_i facet for host ArrayList<String> ff = new ArrayList<>(); for (CollectionSchema csf: CollectionSchema.values()) { if ((csf.getType() != SolrType.num_integer && csf.getType() != SolrType.num_long) || csf.isMultiValued()) continue; String facetfield = csf.getSolrFieldName(); if (!fulltext.getDefaultConfiguration().contains(facetfield)) continue; ff.add(csf.getSolrFieldName()); } // add also vocabulary counters Map<String, ReversibleScoreMap<String>> vocabularyFacet = sb.index.fulltext().getDefaultConnector().getFacets(CollectionSchema.vocabularies_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, 100, CollectionSchema.vocabularies_sxt.getSolrFieldName()); if (vocabularyFacet.size() > 0) { Collection<String> vocnames = vocabularyFacet.values().iterator().next().keyList(true); for (String vocname: vocnames) { ff.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX); ff.add(CollectionSchema.VOCABULARY_PREFIX + vocname + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX); } } // list the facets String[] facetfields = ff.toArray(new String[ff.size()]); Map<String, ReversibleScoreMap<String>> facets = fulltext.getDefaultConnector().getFacets(CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"", 100, facetfields); int fc = 0; for (Map.Entry<String, ReversibleScoreMap<String>> facetentry: facets.entrySet()) { ReversibleScoreMap<String> facetfieldmap = facetentry.getValue(); if (facetfieldmap.size() == 0) continue; TreeMap<Long, Integer> statMap = new TreeMap<>(); for (String k: facetfieldmap) statMap.put(Long.parseLong(k), facetfieldmap.get(k)); prop.put("hostanalysis_facets_" + fc + "_facetname", facetentry.getKey()); int c = 0; for (Entry<Long, Integer> entry: statMap.entrySet()) { prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_key", entry.getKey()); prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_count", entry.getValue()); prop.put("hostanalysis_facets_" + fc + "_facet_" + c + "_a", "solr/collection1/select?q=host_s:" + host + " AND " + facetentry.getKey() + ":" + entry.getKey() + "&defType=edismax&start=0&rows=1000&fl=sku,crawldepth_i"); c++; } prop.put("hostanalysis_facets_" + fc + "_facet", c); fc++; } prop.put("hostanalysis_facets", fc); prop.put("hostanalysis", 1); } // write file list for subpath boolean delete = false; boolean reload404 = false; if (authorized && post.containsKey("delete")) { // delete the complete path!! That includes everything that matches with this prefix. delete = true; } if (authorized && post.containsKey("reload404")) { // try to re-load all urls that have load errors and matches with this prefix. reload404 = true; } int facetcount=post.getInt("facetcount", 0); boolean complete = post.getBoolean("complete"); if (complete) { // we want only root paths for complete lists p = path.indexOf('/', 10); if (p > 0) path = path.substring(0, p + 1); } prop.put("files_complete", complete ? 1 : 0); prop.put("files_complete_admin", admin ? "true" : "false"); prop.putHTML("files_complete_path", path); p = path.substring(0, path.length() - 1).lastIndexOf('/'); if (p < 8) { prop.put("files_root", 1); } else { prop.put("files_root", 0); prop.putHTML("files_root_path", path.substring(0, p + 1)); prop.put("files_root_admin", admin ? "true" : "false"); } // generate file list from path prop.putHTML("outbound_host", host); if (authorized) prop.putHTML("outbound_admin_host", host); //used for WebStructurePicture_p link prop.putHTML("inbound_host", host); String hosthash = uri.hosthash(); String[] pathparts = uri.getPaths(); // get all files for a specific host from the index StringBuilder q = new StringBuilder(); if (host == null) { if (path.startsWith("file://")) { q.append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(":file"); } } else { q.append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(host).append("\""); } if (pathparts.length > 0 && pathparts[0].length() > 0) { for (String pe: pathparts) { if (pe.length() > 0) q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(":\"").append(pe).append('\"'); } } else { if (facetcount > 1000 || post.containsKey("nepr")) { q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM); } } BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, 100, 1, false, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.failreason_s.getSolrFieldName(), CollectionSchema.failtype_s.getSolrFieldName(), CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(), CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(), CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(), CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(), CollectionSchema.crawldepth_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), CollectionSchema.references_external_i.getSolrFieldName(), CollectionSchema.references_exthosts_i.getSolrFieldName(), CollectionSchema.cr_host_chance_d.getSolrFieldName(), CollectionSchema.cr_host_norm_i.getSolrFieldName() ); SolrDocument doc; Set<String> storedDocs = new HashSet<String>(); Map<String, FailType> errorDocs = new HashMap<String, FailType>(); Set<String> inboundLinks = new HashSet<String>(); Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>(); Map<String, InfoCacheEntry> infoCache = new HashMap<String, InfoCacheEntry>(); int hostsize = 0; final List<String> deleteIDs = new ArrayList<String>(); final Collection<String> reloadURLs = new ArrayList<String>(); final Set<String> reloadURLCollection = new HashSet<String>(); long timeoutList = System.currentTimeMillis() + TIMEOUT; long timeoutReferences = System.currentTimeMillis() + 6000; ReferenceReportCache rrCache = sb.index.getReferenceReportCache(); while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); FailType error = errortype == null ? null : FailType.valueOf(errortype); String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences)); if (u.startsWith(path)) { if (delete) { deleteIDs.add(ids); } else { if (error == null) storedDocs.add(u); else { if (reload404 && error == FailType.fail) { ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName()); if (collections != null) reloadURLCollection.addAll(collections); reloadURLs.add(u); } if (authorized) errorDocs.put(u, error); } } } else if (complete) { if (error == null) storedDocs.add(u); else { if (authorized) errorDocs.put(u, error); } } if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link if (error == null) { hostsize++; // collect inboundlinks to browse the host Iterator<String> links = URIMetadataNode.getLinks(doc, true); while (links.hasNext()) { u = links.next(); if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); } // collect referrer links links = URIMetadataNode.getLinks(doc, false); while (links.hasNext()) { u = links.next(); try { MultiProtocolURL mu = new MultiProtocolURL(u); if (mu.getHost() != null) { ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost()); if (lks == null) { lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator); outboundHosts.put(mu.getHost(), lks); } lks.set(u, u.length()); } } catch (final MalformedURLException e) {} } } if (System.currentTimeMillis() > timeoutList) break; } if (deleteIDs.size() > 0) sb.remove(deleteIDs); if (reloadURLs.size() > 0) { final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>(); for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern); sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false); } // collect from crawler List<Request> domainStackReferences = (authorized) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList<Request>(0); Set<String> loadingLinks = new HashSet<String>(); for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true)); // now combine all lists into one Map<String, StoreType> files = new HashMap<String, StoreType>(); for (String u: storedDocs) files.put(u, StoreType.INDEX); for (Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED); for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK); for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK); ConcurrentLog.info("HostBrowser", "collected " + files.size() + " urls for path " + path); // distinguish files and folders Map<String, Object> list = new TreeMap<String, Object>(); // a directory list; if object is boolean, its a file; if its a int[], then its a folder int pl = path.length(); String file; for (Map.Entry<String, StoreType> entry: files.entrySet()) { if (entry.getKey().length() < pl) continue; // this is not inside the path if (!entry.getKey().startsWith(path)) continue; file = entry.getKey().substring(pl); StoreType type = entry.getValue(); p = file.indexOf('/'); if (p < 0) { // this is a file list.put(entry.getKey(), type); // StoreType value: this is a file; true -> file is in index; false -> not in index, maybe in crawler } else { // this is a directory path or a file in a subdirectory String remainingPath = file.substring(0, p + 1); if (complete && remainingPath.indexOf('.') > 0) { list.put(entry.getKey(), type); // StoreType value: this is a file } else { String dir = path + remainingPath; Object c = list.get(dir); if (c == null) { int[] linkedStoredIncrawlerError = new int[]{0,0,0,0,0}; if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++; if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++; if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++; if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++; list.put(dir, linkedStoredIncrawlerError); } else if (c instanceof int[]) { if (type == StoreType.LINK) ((int[]) c)[0]++; if (type == StoreType.INDEX) ((int[]) c)[1]++; if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++; if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++; } } } } int maxcount = 1000; int c = 0; // first list only folders int filecounter = 0; for (Map.Entry<String, Object> entry: list.entrySet()) { if ((entry.getValue() instanceof StoreType)) { filecounter++; } else { // this is a folder prop.put("files_list_" + c + "_type", 1); prop.putHTML("files_list_" + c + "_type_url", entry.getKey()); prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false"); int linked = ((int[]) entry.getValue())[0]; int stored = ((int[]) entry.getValue())[1]; int crawler = ((int[]) entry.getValue())[2]; int excl = ((int[]) entry.getValue())[3]; int error = ((int[]) entry.getValue())[4]; prop.put("files_list_" + c + "_type_stored", stored); prop.put("files_list_" + c + "_type_linked", linked); prop.put("files_list_" + c + "_type_pendingVisible", crawler > 0 ? 1 : 0); prop.put("files_list_" + c + "_type_pending", crawler); prop.put("files_list_" + c + "_type_excludedVisible", excl > 0 ? 1 : 0); prop.put("files_list_" + c + "_type_excluded", excl); prop.put("files_list_" + c + "_type_failedVisible", error > 0 ? 1 : 0); prop.put("files_list_" + c + "_type_failed", error); if (++c >= maxcount) break; } } // then list files for (Map.Entry<String, Object> entry: list.entrySet()) { if (entry.getValue() instanceof StoreType) { // this is a file prop.put("files_list_" + c + "_type", 0); prop.putHTML("files_list_" + c + "_type_url", entry.getKey()); prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false"); StoreType type = (StoreType) entry.getValue(); try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;} HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash()); // todo: cannot identify errors boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS); boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED; boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/"); if (!dc) { prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/); if (type == StoreType.INDEX) { String ids = ASCII.String(uri.hash()); InfoCacheEntry ice = infoCache.get(ids); prop.put("files_list_" + c + "_type_stored_comment", ice == null ? "" : ice.toString()); // ice.toString() contains html, therefore do not use putHTML here } prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0); if (error) { FailType failType = errorDocs.get(entry.getKey()); if (failType == null) { // maybe this is only in the errorURL //Metadata faildoc = sb.index.fulltext().getDefaultConnector().getMetadata(ASCII.String(uri.hash())); prop.putHTML("files_list_" + c + "_type_stored_error", "unknown error"); } else { String ids = ASCII.String(uri.hash()); InfoCacheEntry ice = infoCache.get(ids); prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail" + (ice == null ? "" : "; " + ice.toString())); } } if (loadRight) { prop.putHTML("files_list_" + c + "_type_stored_load_url", entry.getKey()); prop.putHTML("files_list_" + c + "_type_stored_load_path", path); prop.putHTML("files_list_" + c + "_type_stored_load_admin", Boolean.toString(admin)); } if (++c >= maxcount) break; } } } prop.put("files_list", c); prop.putHTML("files_path", path); prop.put("files_hostsize", hostsize); prop.put("files_subpathloadsize", storedDocs.size()); prop.put("files_subpathdetectedsize", filecounter - storedDocs.size()); prop.put("files", 1); uri = new DigestURL(path); if (post.containsKey("showlinkstructure")) { sb.setConfig(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true); } prop.put("files_linkgraph", uri.getPath().length() <= 1 && hostsize > 0 && sb.getConfigBool(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true)); prop.put("files_linkgraph_host", uri.getHost()); // generate inbound-links table StructureEntry struct = sb.webStructure.incomingReferences(hosthash); if (struct != null && struct.references.size() > 0) { maxcount = 200; ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator); for (Map.Entry<String, Integer> entry: struct.references.entrySet()) score.set(entry.getKey(), entry.getValue()); c = 0; Iterator<String> i = score.keys(false); while (i.hasNext() && c < maxcount) { host = i.next(); prop.put("inbound_list_" + c + "_admin", admin ? "true" : "false"); prop.putHTML("inbound_list_" + c + "_host", sb.webStructure.hostHash2hostName(host)); prop.put("inbound_list_" + c + "_count", score.get(host)); c++; } prop.put("inbound_list", c); prop.put("inbound", 1); } else { prop.put("inbound", 0); } // generate outbound-links table if (outboundHosts.size() > 0) { maxcount = 200; ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator); for (Map.Entry<String, ReversibleScoreMap<String>> entry: outboundHosts.entrySet()) score.set(entry.getKey(), entry.getValue().size()); c = 0; Iterator<String> i = score.keys(false); while (i.hasNext() && c < maxcount) { host = i.next(); prop.putHTML("outbound_list_" + c + "_host", host); prop.put("outbound_list_" + c + "_count", score.get(host)); prop.put("outbound_list_" + c + "_link", outboundHosts.get(host).getMinKey()); prop.put("outbound_list_" + c + "_admin", admin ? "true" : "false"); c++; } prop.put("outbound_list", c); prop.put("outbound", 1); } else { prop.put("outbound", 0); } } catch (final Throwable e) { ConcurrentLog.logException(e); } } // return rewrite properties prop.putNum("ucount", fulltext.collectionSize()); return prop; } public static final class InfoCacheEntry { public Integer cr_n; public Double cr_c; public int crawldepth, references, references_internal, references_external, references_exthosts; public List<String> references_internal_urls, references_external_urls; public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) { this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName()); this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName()); Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName()); Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); this.crawldepth = (cr == null || cr.intValue() < 0) ? 0 : cr.intValue(); // for lazy value storage; non-existent means: stored as '0' this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue(); this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue(); // calculate the url reference list this.references_internal_urls = new ArrayList<String>(); this.references_external_urls = new ArrayList<String>(); if (fetchReferences) { // get the references from the citation index try { ReferenceReport rr = rrCache.getReferenceReport(urlhash, false); List<String> internalIDs = new ArrayList<String>(); List<String> externalIDs = new ArrayList<String>(); HandleSet iids = rr.getInternallIDs(); for (byte[] b: iids) internalIDs.add(ASCII.String(b)); HandleSet eids = rr.getExternalIDs(); for (byte[] b: eids) externalIDs.add(ASCII.String(b)); // get all urls from the index and store them here for (String id: internalIDs) { if (id.equals(urlhash)) continue; // no self-references DigestURL u = fulltext.getURL(id); if (u != null) references_internal_urls.add(u.toNormalform(true)); } for (String id: externalIDs) { if (id.equals(urlhash)) continue; // no self-references DigestURL u = fulltext.getURL(id); if (u != null) references_external_urls.add(u.toNormalform(true)); } } catch (final IOException e) { } } this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue(); this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue(); } @Override public String toString() { StringBuilder sbi = new StringBuilder(); int c = 0; for (String s: references_internal_urls) { sbi.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>"); c++; if (c % 80 == 0) sbi.append("<br/>"); } if (sbi.length() > 0) sbi.insert(0, "<br/>internal referrer:"); StringBuilder sbe = new StringBuilder(); c = 0; for (String s: references_external_urls) { sbe.append("<a href='").append(s).append("' target='_blank'><img src='env/grafics/i16.gif' alt='info' title='" + s + "' width='12' height='12'/></a>"); c++; if (c % 80 == 0) sbe.append("<br/>"); } if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:"); return (this.crawldepth == 998 ? "unknown crawldepth" : this.crawldepth >= 0 ? "crawldepth: " + this.crawldepth : "") + (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") + (this.cr_n != null ? ", crn=" + this.cr_n : "") + (this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : ""); } } }