diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-03 11:03:25 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-03 11:03:25 +1000 |
| commit | 811946498edc472d91e5ca8d41a4a0568e0d6e8f (patch) | |
| tree | 3ec4f0950950bc7b3cff782f4f9c2b13f9b51cb3 /backend/data/enwiki/genData.py | |
| parent | 515e02b9453f7740d7429ad7e11d913e32e5ffdb (diff) | |
Adjust enwiki dump-index-db and lookup script to include wiki-ids
Diffstat (limited to 'backend/data/enwiki/genData.py')
| -rwxr-xr-x | backend/data/enwiki/genData.py | 2 |
1 files changed, 1 insertions, 1 deletions
```diff
diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py
index f439d11..3e60bb5 100755
--- a/backend/data/enwiki/genData.py
+++ b/backend/data/enwiki/genData.py
@@ -18,7 +18,7 @@ enwikiDb = "enwikiData.db"
 # Some regexps and functions for parsing wikitext
 descLineRegex = re.compile("^ *[A-Z'\"]")
 embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
- # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag
+ # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
 convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
 parensGrpRegex = re.compile(r" \([^()]*\)")
 leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
```
