Skip to content

Commit 1d94164

Browse files
Merge pull request #41 from commoncrawl/35-wat-html-lang-attributes
WAT extractor: add attributes of the <html> element as metadata
2 parents 456635c + 8627773 commit 1d94164

File tree

3 files changed

+153
-0
lines changed

3 files changed

+153
-0
lines changed

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import java.util.ArrayList;
44
import java.util.HashMap;
55
import java.util.HashSet;
6+
import java.util.Iterator;
67
import java.util.Locale;
78
import java.util.Map;
89
import java.util.Set;
@@ -110,6 +111,8 @@ public class ExtractingParseObserver implements ParseObserver {
110111
extractors.put("AUDIO", new EmbedTagExtractor());
111112
extractors.put("TRACK", new EmbedTagExtractor());
112113
extractors.put("SOURCE", new EmbedTagExtractor());
114+
// language from HTML root element
115+
extractors.put("HTML", new HTMLTagExtractor());
113116

114117
globalHrefAttributes = new HashSet<String>();
115118
globalHrefAttributes.add("background");
@@ -604,6 +607,23 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
604607
}
605608
}
606609

610+
private static class HTMLTagExtractor implements TagExtractor {
611+
@Override
612+
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
613+
ArrayList<String> l = getAttrList(node, "lang", "xml:lang");
614+
if(l != null) {
615+
Iterator<String> it = l.iterator();
616+
while (it.hasNext()) {
617+
String name = it.next();
618+
if (it.hasNext()) {
619+
String lang = it.next();
620+
data.addMeta("name", makePath("HTML", name), "content", lang);
621+
}
622+
}
623+
}
624+
}
625+
}
626+
607627
private static class IFrameTagExtractor implements TagExtractor {
608628
@Override
609629
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import java.io.IOException;
44
import java.util.ArrayList;
55
import java.util.List;
6+
import java.util.Map;
67
import java.util.logging.Logger;
78

89
import org.archive.extract.ExtractingResourceFactoryMapper;
@@ -240,6 +241,20 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
240241
}
241242
}
242243

244+
private void checkExtractHtmlLangAttribute(Resource resource, String... langAttributes)
245+
throws JSONException {
246+
assertNotNull(resource);
247+
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
248+
JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
249+
assertNotNull(metas);
250+
JSONObject meta = metas.getJSONObject(0);
251+
for (int i = 0; i < langAttributes.length; i += 2) {
252+
String key = langAttributes[i];
253+
assertNotNull(meta.get(key));
254+
assertEquals(meta.get(key), langAttributes[i+1]);
255+
}
256+
}
257+
243258
public void testLinkExtraction() throws ResourceParseException, IOException {
244259
String testFileName = "link-extraction-test.warc";
245260
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
@@ -414,6 +429,18 @@ public void testTitleExtraction() throws ResourceParseException, IOException {
414429
checkTitle(resource, "Testing title extraction with embedded SVG");
415430
}
416431

432+
public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, IOException {
433+
String testFileName = "html-lang-attribute.warc";
434+
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
435+
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
436+
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
437+
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
438+
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN");
439+
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz");
440+
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
441+
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX");
442+
}
443+
417444
public void testHtmlParserEntityDecoding() {
418445
String[][] entities = { //
419446
/* ampersand */
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
WARC/1.0
2+
WARC-Type: response
3+
WARC-Date: 2024-12-05T10:47:02Z
4+
Content-Length: 169
5+
Content-Type: application/http; msgtype=response
6+
WARC-Target-URI: https://www.example.org/1
7+
WARC-Identified-Payload-Type: text/html
8+
9+
HTTP/1.1 200
10+
content-type: text/html; charset=UTF-8
11+
12+
<!DOCTYPE html>
13+
<html lang="en">
14+
<head>
15+
<meta charset="UTF-8">
16+
<title>Test</title>
17+
</head>
18+
<body/>
19+
</html>
20+
21+
22+
23+
WARC/1.0
24+
WARC-Type: response
25+
WARC-Date: 2024-12-05T10:47:02Z
26+
Content-Length: 185
27+
Content-Type: application/http; msgtype=response
28+
WARC-Target-URI: https://www.example.org/2
29+
WARC-Identified-Payload-Type: text/html
30+
31+
HTTP/1.1 200
32+
content-type: text/html; charset=UTF-8
33+
34+
<!DOCTYPE html>
35+
<html lang="zh-CN" xmlns="http://www.w3.org/1999/xhtml">
36+
<head>
37+
<title>Test</title>
38+
</head>
39+
<body/>
40+
</html>
41+
42+
43+
44+
WARC/1.0
45+
WARC-Type: response
46+
WARC-Date: 2024-12-05T10:47:02Z
47+
Content-Length: 158
48+
Content-Type: application/http; msgtype=response
49+
WARC-Target-URI: https://www.example.org/3
50+
WARC-Identified-Payload-Type: text/html
51+
52+
HTTP/1.1 200
53+
content-type: text/html; charset=UTF-8
54+
55+
<!DOCTYPE html>
56+
<html dir="ltr" lang="cs-cz">
57+
<head>
58+
<title>Test</title>
59+
</head>
60+
<body/>
61+
</html>
62+
63+
64+
65+
WARC/1.0
66+
WARC-Type: response
67+
WARC-Date: 2024-12-05T10:47:02Z
68+
Content-Length: 319
69+
Content-Type: application/http; msgtype=response
70+
WARC-Target-URI: https://www.example.org/4
71+
WARC-Identified-Payload-Type: text/html
72+
73+
HTTP/1.1 200
74+
content-type: text/html; charset=UTF-8
75+
76+
<!DOCTYPE html>
77+
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr" style="overflow-x: hidden !important;">
78+
<head>
79+
<title>Test</title>
80+
</head>
81+
<body/>
82+
</html>
83+
84+
85+
86+
WARC/1.0
87+
WARC-Type: response
88+
WARC-Date: 2024-12-05T10:47:02Z
89+
Content-Length: 189
90+
Content-Type: application/http; msgtype=response
91+
WARC-Target-URI: https://www.example.org/5
92+
WARC-Identified-Payload-Type: text/html
93+
94+
HTTP/1.1 200
95+
content-type: text/html; charset=UTF-8
96+
97+
<!DOCTYPE html>
98+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="es-MX">
99+
<head>
100+
<title>Test</title>
101+
</head>
102+
<body/>
103+
</html>
104+
105+
106+

0 commit comments

Comments
 (0)