The attached patch adds support for rel=nofollow. Links that specify
rel="nofollow" are ignored, i.e. they are not returned as outlinks. Any
objections to committing this?
Background: http://googleblog.blogspot.com/2005/01/preventing-comment-spam.html
Doug
Index: src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
===================================================================
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (revision 326948)
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (working copy)
@@ -113,6 +113,12 @@
+ "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ " . . . ."
+ "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\"; rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\";> ignore </a>"
+ + "</body></html>"),
};
private static String[] testBaseHrefs= {
@@ -123,6 +129,7 @@
"http://www.nutch.org/frames/";,
"http://www.nutch.org/maps/";,
"http://www.nutch.org/whitespace/";,
+ "http://www.nutch.org//";,
};
private static final DocumentFragment testDOMs[]=
@@ -145,6 +152,7 @@
+ "one two three space here space there no space "
+ "one two two three three four put some text here and there. "
+ "End this madness ! . . . .",
+ "ignore ignore",
};
private static final String[] answerTitle= {
@@ -155,6 +163,7 @@
"my title",
"my title",
"my title",
+ "",
};
// note: should be in page-order
@@ -214,6 +223,8 @@
{
new Outlink("http://www.nutch.org/index.html";, "whitespace test"),
},
+ {
+ }
};
} catch (MalformedURLException e) {
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (revision 326948)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (working copy)
@@ -306,13 +306,21 @@
NamedNodeMap attrs = node.getAttributes();
String target = null;
+ boolean noFollow = false;
for (int i= 0; i < attrs.getLength(); i++ ) {
- if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
- target = attrs.item(i).getNodeValue();
- break;
+ Node attr = attrs.item(i);
+ String attrName = attr.getNodeName();
+
+ if ("rel".equalsIgnoreCase(attrName) &&
+ "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+ noFollow = true;
}
+
+ if (params.attrName.equalsIgnoreCase(attrName)) {
+ target = attr.getNodeValue();
+ }
}
- if (target != null)
+ if (target != null && !noFollow)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),