aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data/enwiki/gen_desc_data.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2023-01-02 14:51:53 +1100
committer Terry Truong <terry06890@gmail.com> 2023-01-02 14:51:53 +1100
commit 56369bccd977ac726bef70895883e79da4e1edd8 (patch)
tree 67a894fe1579f2da150f0162ccbdc8a0a19ef9be /backend/hist_data/enwiki/gen_desc_data.py
parent 0e5e46cedaaeacf59cfd0f2e30c1ae6923466870 (diff)
Adjust wikidata event specifiers
Do minor refactors:
- Swap fmt=1 and fmt=2 in 'events' table
- Make documentation consistently use BC and AD
- Import argparse at start of scripts
Diffstat (limited to 'backend/hist_data/enwiki/gen_desc_data.py')
-rwxr-xr-x backend/hist_data/enwiki/gen_desc_data.py 7
1 files changed, 3 insertions, 4 deletions
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index b3fde52..bb2b845 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -7,14 +7,14 @@ and adds them to a database
# In testing, this script took over 10 hours to run, and generated about 5GB
+import argparse
import sys, os, re
-import bz2
-import html, mwxml, mwparserfromhell
+import bz2, html, mwxml, mwparserfromhell
import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
DB_FILE = 'desc_data.db'
-
+# Regexps
DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
@@ -119,7 +119,6 @@ def convertTitle(title: str) -> str:
return html.unescape(title).replace('_', ' ')
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
#