/**
 *  URLRewriterLibrary
 *  Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 *  first published 08.10.2012 on http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.crawler.retrieval;

import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import net.yacy.cora.storage.Files;
import net.yacy.cora.util.ConcurrentLog;

/**
 * A set of regular-expression rewrite rules that can be applied to a string
 * (typically a URL) one after another.
 *
 * <p>Rules are read from every entry of a directory. Each line is trimmed and then:
 * <ul>
 *   <li>empty lines and lines starting with {@code #} are skipped;</li>
 *   <li>lines starting with {@code s/} are skipped (no handling for them exists here);</li>
 *   <li>any other line is split at the first {@code =} or, when the line contains
 *       no {@code =}, at the first {@code :}. The part before the separator is
 *       compiled as a regular expression, the part after it is the replacement.
 *       Lines without a separator, or with the separator as first character, are skipped.</li>
 * </ul>
 * Because the split happens at the <em>first</em> separator, a pattern read from
 * a file cannot itself contain {@code =}.
 */
public class URLRewriterLibrary {

    private static final ConcurrentLog log = new ConcurrentLog(URLRewriterLibrary.class.getName());

    /** Directory the rules were read from; {@code null} for a library created without a path. */
    private final File rewritingPath;

    /**
     * Compiled pattern mapped to its replacement string. Insertion-ordered so that
     * rules are applied in the order they were added; this matters because every
     * rule operates on the output of the previous one.
     */
    private final Map<Pattern, String> rewriters;

    /**
     * Creates a library and loads rewrite rules from all entries of the given directory.
     * If the path is {@code null}, does not exist, or cannot be listed, the library
     * stays empty. A file that cannot be read is logged and skipped.
     *
     * @param rewritingPath directory containing the rule files, may be {@code null}
     */
    public URLRewriterLibrary(final File rewritingPath) {
        this.rewriters = new LinkedHashMap<Pattern, String>();
        this.rewritingPath = rewritingPath;
        if (this.rewritingPath == null || !this.rewritingPath.exists()) {
            return;
        }
        final String[] files = this.rewritingPath.list();
        if (files == null) {
            // File.list() yields null when the path is not a directory or an I/O error occurs
            return;
        }
        for (final String f : files) {
            final File ff = new File(this.rewritingPath, f);
            try {
                final BlockingQueue<String> list = Files.concurentLineReader(ff);
                String line;
                // reference comparison on purpose: the loop ends when the reader
                // hands over the POISON_LINE instance itself
                while ((line = list.take()) != Files.POISON_LINE) {
                    addRule(line);
                }
            } catch (final InterruptedException e) {
                // keep the interruption visible to the caller and stop loading further files
                Thread.currentThread().interrupt();
                log.warn("interrupted while reading rewriting file " + f);
                break;
            } catch (final Throwable e) {
                // best effort: one unreadable file must not prevent loading the others
                log.warn("cannot read rewriting file " + f, e);
            }
        }
    }

    /**
     * Creates an empty library without any rewrite rules.
     */
    public URLRewriterLibrary() {
        this.rewriters = new LinkedHashMap<Pattern, String>();
        this.rewritingPath = null;
    }

    /**
     * Parses one line of a rule file and registers the rule it describes, if any.
     * A pattern that does not compile is logged and skipped.
     *
     * @param rawLine the line as delivered by the line reader
     */
    private void addRule(final String rawLine) {
        final String line = rawLine.trim();
        if (line.length() == 0 || line.charAt(0) == '#') {
            return;
        }
        if (line.startsWith("s/")) {
            // lines in this form are recognised but not processed
            return;
        }
        int p = line.indexOf('=');
        if (p < 0) {
            p = line.indexOf(':');
        }
        if (p <= 0) {
            // no separator, or nothing in front of it
            return;
        }
        final String regex = line.substring(0, p);
        try {
            this.rewriters.put(Pattern.compile(regex), line.substring(p + 1));
        } catch (final PatternSyntaxException e) {
            log.warn("bad pattern: " + regex);
        }
    }

    /**
     * Applies all rewrite rules to the given string. Every occurrence of a rule's
     * pattern inside the string is replaced by the rule's replacement; a pattern
     * does not need to cover the whole string. Rules are applied one after another,
     * each on the result of the previous one.
     *
     * <p>The replacement is interpreted by {@link Matcher#replaceAll(String)}, so
     * {@code $} and {@code \} have their special meaning there.
     *
     * @param s the string to rewrite, may be {@code null}
     * @return the rewritten string; {@code s} itself if it is {@code null} or no rules exist
     */
    public String apply(String s) {
        if (s == null || this.rewriters == null || this.rewriters.isEmpty()) {
            return s;
        }
        for (final Map.Entry<Pattern, String> entry : this.rewriters.entrySet()) {
            // replaceAll leaves the input untouched when the pattern is not found
            s = entry.getKey().matcher(s).replaceAll(entry.getValue());
        }
        return s;
    }

    /**
     * Small manual demonstration: registers a few rules and prints one sample rewrite.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        URLRewriterLibrary lib = new URLRewriterLibrary();
        lib.rewriters.put(Pattern.compile("cln_\\d+\\/"), ""); // www.bund.de
        lib.rewriters.put(Pattern.compile("&administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
        lib.rewriters.put(Pattern.compile("\\?administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
        // NOTE(review): the closing "\\)" is an assumption; without it the pattern
        // ends in a dangling backslash and does not compile. Confirm the intended expression.
        lib.rewriters.put(Pattern.compile("\\(X\\([1]\\)"), ""); // herzogenrath
        lib.rewriters.put(Pattern.compile("\\(S\\([0-9a-z]+\\)\\)\\/"), ""); // herzogenrath
        lib.rewriters.put(Pattern.compile("&ccm=[0-9]*"), ""); // herne
        lib.rewriters.put(Pattern.compile("&sid=[0-9]{14}.{8}"), ""); // startercenter

        final String sample = "http://www.bund.de/cln_123/index.html?x=1&ccm=42";
        System.out.println(sample + " -> " + lib.apply(sample));
    }
}