How to read a Wikipedia dump sequentially, without an index file
Code
// Wikipedia dump file to read (bz2-compressed multistream pages-articles XML, per the file name).
File wikiDump = new File("/usr/local/wiki/enwiki/20230101/enwiki-20230101-pages-articles-multistream.xml.bz2");
// try-with-resources closes the reader even when the handler aborts the read early.
try (WikiDumpReader reader = new WikiDumpReader(wikiDump)) {
    reader.read(new WikiPageHandler() {
        // Number of page titles to print before stopping.
        private final int maxPages = 30;
        // Number of pages passed to read(...) so far.
        private int pagesSeen;

        /**
         * Prints the title of each page handed to this handler.
         *
         * @param page the page currently being visited
         * @throws BreakException on the first page past the limit, to end the read early
         */
        @Override
        public void read(WikiPage page) throws BreakException {
            if (++pagesSeen > maxPages) {
                throw new BreakException();
            }
            System.out.println(page.getTitle());
            // To print the page text as well: System.out.println(page.getText());
        }
    });
} catch (BreakException stopRequested) {
    // Intentionally empty: the handler throws BreakException only to stop after the limit.
}
Result
AccessibleComputing
Anarchism
AfghanistanHistory
AfghanistanGeography
AfghanistanPeople
AfghanistanCommunications
AfghanistanTransportations
AfghanistanMilitary
AfghanistanTransnationalIssues
AssistiveTechnology
AmoeboidTaxa
Autism
AlbaniaHistory
AlbaniaPeople
AsWeMayThink
AlbaniaGovernment
AlbaniaEconomy
Albedo
AfroAsiaticLanguages
ArtificalLanguages
AbacuS
AbalonE
AbbadideS
AbbesS
AbbevilleFrance
AbbeY
AbboT
Abbreviations
AtlasShrugged
ArtificialLanguages