# pylint:disable-msg=E0611,I1101
"""
Custom parsers and XPath expressions for date extraction
"""
import logging
import re
from datetime import datetime
from functools import lru_cache
from typing import List, Optional, Pattern, Tuple
# coverage for date parsing
from dateparser import DateDataParser # type: ignore # third-party, slow
from dateparser_data.settings import default_parsers
from dateutil.parser import parse as dateutil_parse
from lxml.etree import XPath
from lxml.html import HtmlElement
# own
from .settings import CACHE_SIZE
from .utils import Extractor, trim_text
from .validators import convert_date, is_valid_date
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Shared dateparser instance, created once at import time and queried by
# external_date_parser(). No language, locale or region is preset (all None).
EXTERNAL_PARSER = DateDataParser(
    languages=None,
    locales=None,
    region=None,
    settings={
        "NORMALIZE": True,  # False may be faster
        # keep every default parser except the three named in the filter below
        "PARSERS": [
            p
            for p in default_parsers
            if p not in ("no-spaces-time", "relative-time", "timestamp")
        ],
        "PREFER_DATES_FROM": "past",
        "PREFER_LOCALE_DATE_ORDER": True,
        "RETURN_AS_TIMEZONE_AWARE": False,
        "STRICT_PARSING": True,
    },
)
# XPath prefix selecting descendants restricted to a fixed list of element names.
FAST_PREPEND = ".//*[(self::div or self::h2 or self::h3 or self::h4 or self::li or self::p or self::span or self::time or self::ul)]"
# self::b or self::em or self::font or self::i or self::strong
# XPath prefix selecting any descendant element.
SLOW_PREPEND = ".//*"
# XPath predicate matching elements whose id/class/itemprop values contain
# date-related keywords, followed by a union with footer and small elements.
# This is a plain string, not a compiled expression; presumably it is appended
# to one of the prefixes above by code outside this section -- verify callers.
# translate(..., "D", "d") lower-cases the letter D only, nothing else.
# NOTE(review): in XPath 1.0 a node-set argument such as @id|@class is
# converted to a string using only its first node in document order, so
# contains(@id|@class, 'x') does not test both attributes -- confirm intended.
DATE_EXPRESSIONS = """
[
contains(translate(@id|@class|@itemprop, "D", "d"), 'date') or
contains(translate(@id|@class|@itemprop, "D", "d"), 'datum') or
contains(translate(@id|@class, "M", "m"), 'meta') or
contains(@id|@class, 'time') or
contains(@id|@class, 'publish') or
contains(@id|@class, 'footer') or
contains(@class, 'info') or
contains(@class, 'post_detail') or
contains(@class, 'block-content') or
contains(@class, 'byline') or
contains(@class, 'subline') or
contains(@class, 'posted') or
contains(@class, 'submitted') or
contains(@class, 'created-post') or
contains(@class, 'publication') or
contains(@class, 'author') or
contains(@class, 'autor') or
contains(@class, 'field-content') or
contains(@class, 'fa-clock-o') or
contains(@class, 'fa-calendar') or
contains(@class, 'fecha') or
contains(@class, 'parution')
] |
.//footer | .//small
"""
# further tests needed:
# or contains(@class, 'article')
# or contains(@id, 'lastmod') or contains(@class, 'updated')
# Compiled XPath returning the text nodes directly under the FAST_PREPEND elements.
FREE_TEXT_EXPRESSIONS = XPath(FAST_PREPEND + "/text()")
# Segment length bounds in characters. MAX_SEGMENT_LEN truncates the input of
# try_date_expr(); MIN_SEGMENT_LEN is not referenced in this part of the file.
MIN_SEGMENT_LEN = 6
MAX_SEGMENT_LEN = 52
# discard parts of the webpage
# archive.org banner inserts
# Compiled XPath used by discard_unwanted() to locate the subtrees to remove.
DISCARD_EXPRESSIONS = XPath(""".//div[@id="wm-ipp-base" or @id="wm-ipp"]""")
# not discarded for consistency (see above):
# .//footer
# .//*[(self::div or self::section)][@id="footer" or @class="footer"]
# Regex fragments interpolated into the patterns below. They are deliberately
# loose: DAY_RE accepts 0-39 and MONTH_RE accepts 0-19, so callers still have
# to validate the values (e.g. by building a datetime).
DAY_RE = "[0-3]?[0-9]"
MONTH_RE = "[0-1]?[0-9]"
# Years 1990-2039. The alternation is wrapped in a non-capturing group so that
# the fragment stays atomic wherever it is interpolated: without the group, a
# pattern such as "({YEAR_RE}-{MONTH_RE}-{DAY_RE})" would parse as
# "199x" OR "20xx-mm-dd" and 1990s dates would be captured as the year only.
# The group is non-capturing, so group numbering of all patterns is unchanged.
YEAR_RE = "(?:199[0-9]|20[0-3][0-9])"
# regex cache
# eight consecutive digits delimited by word boundaries (YYYYMMDD candidates)
YMD_NO_SEP_PATTERN = re.compile(r"\b(\d{8})\b")
# Y-M-D (groups year/month/day) or D-M-Y (groups day2/month2/year2, with a
# 2- to 4-digit year), separated by "-", "/" or "."
YMD_PATTERN = re.compile(
    rf"(?:\D|^)(?:(?P<year>{YEAR_RE})[\-/.](?P<month>{MONTH_RE})[\-/.](?P<day>{DAY_RE})|"
    rf"(?P<day2>{DAY_RE})[\-/.](?P<month2>{MONTH_RE})[\-/.](?P<year2>\d{{2,4}}))(?:\D|$)"
)
# Y-M (groups year/month) or M-Y (groups month2/year2)
YM_PATTERN = re.compile(
    rf"(?:\D|^)(?:(?P<year>{YEAR_RE})[\-/.](?P<month>{MONTH_RE})|"
    rf"(?P<month2>{MONTH_RE})[\-/.](?P<year2>{YEAR_RE}))(?:\D|$)"
)
# Month names and abbreviations; the line breaks are stripped when the string
# is embedded in LONG_TEXT_PATTERN, which is compiled case-insensitively.
REGEX_MONTHS = """
January?|February?|March|A[pv]ril|Ma[iy]|Jun[ei]|Jul[iy]|August|September|O[ck]tober|November|De[csz]ember|
Jan|Feb|M[aä]r|Apr|Jun|Jul|Aug|Sep|O[ck]t|Nov|De[cz]|
Januari|Februari|Maret|Mei|Agustus|
Jänner|Feber|März|
janvier|février|mars|juin|juillet|aout|septembre|octobre|novembre|décembre|
Ocak|Şubat|Mart|Nisan|Mayıs|Haziran|Temmuz|Ağustos|Eylül|Ekim|Kasım|Aralık|
Oca|Şub|Mar|Nis|Haz|Tem|Ağu|Eyl|Eki|Kas|Ara
"""  # todo: check "août"
# "Month day, year" (groups month/day/year) or "day Month year"
# (groups day2/month2/year2). All newlines are removed before compiling, so
# the literal below must not be indented.
LONG_TEXT_PATTERN = re.compile(
    rf"""(?P<month>{REGEX_MONTHS})\s
(?P<day>{DAY_RE})(?:st|nd|rd|th)?,? (?P<year>{YEAR_RE})|
(?P<day2>{DAY_RE})(?:st|nd|rd|th|\.)? (?:of )?
(?P<month2>{REGEX_MONTHS})[,.]? (?P<year2>{YEAR_RE})""".replace(
        "\n", ""
    ),
    re.I,
)
# year, month and day separated by "/", "_" or "-", preceded by a non-digit
COMPLETE_URL = re.compile(rf"\D({YEAR_RE})[/_-]({MONTH_RE})[/_-]({DAY_RE})(?:\D|$)")
# "dateModified" / "datePublished" keys followed by a Y-M-D value (group 1)
JSON_MODIFIED = re.compile(rf'"dateModified": ?"({YEAR_RE}-{MONTH_RE}-{DAY_RE})', re.I)
JSON_PUBLISHED = re.compile(
    rf'"datePublished": ?"({YEAR_RE}-{MONTH_RE}-{DAY_RE})', re.I
)
# Y-M-D (group 1), one arbitrary character, then HH:MM:SS
TIMESTAMP_PATTERN = re.compile(
    rf"({YEAR_RE}-{MONTH_RE}-{DAY_RE}).[0-9]{{2}}:[0-9]{{2}}:[0-9]{{2}}"
)
# English, French, German, Indonesian and Turkish dates cache
# One tuple per calendar month, January first; every entry is a lower-case
# spelling or abbreviation of that month.
MONTHS = [
    ("jan", "januar", "jänner", "january", "januari", "janvier", "ocak", "oca"),
    ("feb", "februar", "feber", "february", "februari", "février", "şubat", "şub"),
    ("mar", "mär", "märz", "march", "maret", "mart", "mars"),
    ("apr", "april", "avril", "nisan", "nis"),
    ("may", "mai", "mei", "mayıs"),
    ("jun", "juni", "june", "juin", "haziran", "haz"),
    ("jul", "juli", "july", "juillet", "temmuz", "tem"),
    ("aug", "august", "agustus", "ağustos", "ağu", "aout"),
    ("sep", "september", "septembre", "eylül", "eyl"),
    ("oct", "oktober", "october", "octobre", "okt", "ekim", "eki"),
    ("nov", "november", "kasım", "kas", "novembre"),
    ("dec", "dez", "dezember", "december", "desember", "décembre", "aralık", "ara"),
]
# Flat lookup table: month spelling -> month number (1-12). regex_parse()
# lower-cases the matched month text before looking it up here.
TEXT_MONTHS = {
    month: mnum for mnum, mlist in enumerate(MONTHS, start=1) for month in mlist
}
# Matches any of the listed separator characters, or a string made of digits
# only. try_date_expr() requires a hit before calling the external parser.
TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")
# Strings matching any alternative are rejected by try_date_expr() before any
# parsing is attempted.
DISCARD_PATTERNS = re.compile(
    r"^\d{2}:\d{2}(?: |:|$)|"  # starts with a HH:MM time
    r"^\D*\d{4}\D*$|"  # exactly four consecutive digits and no others
    r"[$€¥Ұ£¢₽₱฿#₹]|"  # currency symbols and special characters
    r"[A-Z]{3}[^A-Z]|"  # currency codes
    r"(?:^|\D)(?:\+\d{2}|\d{3}|\d{5})\D|"  # tel./IPs/postal codes
    r"ftps?|https?|sftp|"  # protocols
    r"\.(?:com|net|org|info|gov|edu|de|fr|io)\b|"  # TLDs
    r"IBAN|[A-Z]{2}[0-9]{2}|"  # bank accounts
    r"®"  # registered-trademark sign; NOTE(review): the original comment here read "©" -- confirm whether © should be discarded too
)
# use of regex module for speed?
# Keyword-anchored numeric dates; each alternative captures three numeric
# groups, and the whole pattern is case-insensitive.
TEXT_PATTERNS = re.compile(
    r'(?:date[^0-9"]{,20}|updated|published|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|'  # EN
    r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|"  # DE
    r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"  # TR, keyword before the date
    r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)",  # TR
    re.I,
)
# core patterns
# day, month and four-digit year separated by "/", "." or "-"
THREE_COMP_REGEX_A = re.compile(rf"({DAY_RE})[/.-]({MONTH_RE})[/.-]({YEAR_RE})")
# day, month and two-digit year: slashes (groups 1-3) or dots/dashes (groups 4-6)
THREE_COMP_REGEX_B = re.compile(
    rf"({DAY_RE})/({MONTH_RE})/([0-9]{{2}})|({DAY_RE})[.-]({MONTH_RE})[.-]([0-9]{{2}})"
)
# month and four-digit year
TWO_COMP_REGEX = re.compile(rf"({MONTH_RE})[/.-]({YEAR_RE})")
# extensive search patterns
# NOTE(review): the *_PATTERN / *_CATCH (or *_YEAR) pairs below are not used in
# this part of the file; presumably the first locates candidate substrings and
# the second splits them into components -- verify against the callers.
YEAR_PATTERN = re.compile(rf"^\D?({YEAR_RE})")
# Copyright marker followed by a year or a year range; group 1 is the last
# year. The optional range prefix wraps YEAR_RE in its own group so that an
# alternation inside YEAR_RE cannot split "<year>-" into "<year>" OR
# "<year>-", which made ranges starting in the 1990s yield the first year.
# NOTE(review): the second alternative duplicates the first; it may originally
# have been an HTML entity for the copyright sign -- confirm upstream.
COPYRIGHT_PATTERN = re.compile(
    rf"(?:©|\©|Copyright|\(c\))\D*(?:(?:{YEAR_RE})-)?({YEAR_RE})\D"
)
THREE_PATTERN = re.compile(r"/([0-9]{4}/[0-9]{2}/[0-9]{2})[01/]")
THREE_CATCH = re.compile(r"([0-9]{4})/([0-9]{2})/([0-9]{2})")
THREE_LOOSE_PATTERN = re.compile(r"\D([0-9]{4}[/.-][0-9]{2}[/.-][0-9]{2})\D")
THREE_LOOSE_CATCH = re.compile(r"([0-9]{4})[/.-]([0-9]{2})[/.-]([0-9]{2})")
SELECT_YMD_PATTERN = re.compile(r"\D([0-3]?[0-9][/.-][01]?[0-9][/.-][0-9]{4})\D")
SELECT_YMD_YEAR = re.compile(rf"({YEAR_RE})\D?$")
YMD_YEAR = re.compile(rf"^({YEAR_RE})")
# eight-digit strings starting with 19 or 20, delimited by non-digits
DATESTRINGS_PATTERN = re.compile(
    r"(\D19[0-9]{2}[01][0-9][0-3][0-9]\D|\D20[0-9]{2}[01][0-9][0-3][0-9]\D)"
)
DATESTRINGS_CATCH = re.compile(rf"({YEAR_RE})([01][0-9])([0-3][0-9])")
# D/M/YY with slashes, or DD.MM.YY with dots
SLASHES_PATTERN = re.compile(
    r"\D([0-3]?[0-9]/[01]?[0-9]/[0129][0-9]|[0-3][0-9]\.[01][0-9]\.[0129][0-9])\D"
)
SLASHES_YEAR = re.compile(r"([0-9]{2})$")
YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-](?:1[0-2]|0[1-9]))\D")
# NOTE(review): the trailing empty alternative lets group 2 match an empty
# month -- confirm whether that is intended.
YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9]|)")
MMYYYY_PATTERN = re.compile(r"\D([01]?[0-9][/.-][12][0-9]{3})\D")
MMYYYY_YEAR = re.compile(rf"({YEAR_RE})\D?$")
# a year between non-digits, unless the non-digit directly follows "w3.org"
SIMPLE_PATTERN = re.compile(rf"(?<!w3.org)\D({YEAR_RE})\D")
def discard_unwanted(tree: HtmlElement) -> Tuple[HtmlElement, List[HtmlElement]]:
    """Prune unwanted subtrees from an HTML document.

    Every element selected by DISCARD_EXPRESSIONS is detached from its parent.

    Returns:
        A tuple of the tree itself (modified in place) and the list of
        elements that were removed from it.
    """
    removed = list(DISCARD_EXPRESSIONS(tree))
    for node in removed:
        node.getparent().remove(node)
    return tree, removed
def correct_year(year: int) -> int:
    """Expand a two-digit year to four digits.

    Values below 100 are mapped to 19YY when they are 90 or above and to 20YY
    otherwise; any other value is returned unchanged.
    """
    if year >= 100:
        return year
    century = 1900 if year >= 90 else 2000
    return year + century
def try_swap_values(day: int, month: int) -> Tuple[int, int]:
    """Swap day and month values if it seems feasible.

    The swap happens only when the month is out of range (above 12) while the
    day would be a valid month (12 or below).

    Returns:
        The (day, month) pair, possibly exchanged.
    """
    should_swap = month > 12 and day <= 12
    return (month, day) if should_swap else (day, month)
# (stray "[docs]" link marker from the rendered documentation page removed)
def regex_parse(string: str) -> Optional[datetime]:
    """Try full-text parse for date elements using a series of regular expressions
    with particular emphasis on English, French, German and Turkish.

    Args:
        string: free text that may contain a date with a spelled-out month.

    Returns:
        A datetime built from the first LONG_TEXT_PATTERN match, or None when
        nothing matches or the matched parts do not form a usable date.
    """
    # https://github.com/vi3k6i5/flashtext ?
    # multilingual day-month-year + American English patterns
    match = LONG_TEXT_PATTERN.search(string)
    if not match:
        return None
    # the pattern has two alternatives: "month day year" fills the plain
    # groups, "day month year" fills the groups suffixed with "2"
    groups = (
        ("day", "month", "year")
        if match.lastgroup == "year"
        else ("day2", "month2", "year2")
    )
    try:
        day = int(match.group(groups[0]))
        # The pattern is case-insensitive and the re module treats i, ı and İ
        # as equivalent, so it can match spellings (e.g. "Mayis", "KASIM")
        # whose lower-cased form is not a key of TEXT_MONTHS: that lookup
        # raises KeyError, which is handled below like an invalid date.
        month = int(TEXT_MONTHS[match.group(groups[1]).lower().strip(".")])
        year = correct_year(int(match.group(groups[2])))
        day, month = try_swap_values(day, month)
        dateobject = datetime(year, month, day)
    except (KeyError, ValueError):
        LOGGER.debug("no usable date in multilingual match: %s", match[0])
        return None
    LOGGER.debug("multilingual text found: %s", dateobject)
    return dateobject
# (stray "[docs]" link marker from the rendered documentation page removed)
def custom_parse(
    string: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
    """Fast date parsing that tries to avoid the slow dateparser.

    The stages run in order and the first plausible result wins:
    a shortcut for strings starting with four digits, YYYYMMDD, the
    Y-M-D / D-M-Y patterns, the Y-M / M-Y patterns and finally regex_parse().

    Args:
        string: candidate text.
        outputformat: strftime format of the returned string.
        min_date: earliest acceptable date, passed to is_valid_date().
        max_date: latest acceptable date, passed to is_valid_date().

    Returns:
        The formatted date, or None if no stage yields a valid date.
    """

    def _validated(found: datetime, log_message: str) -> Optional[str]:
        # shared acceptance step of the regex stages 2 to 4
        if not is_valid_date(found, "%Y-%m-%d", earliest=min_date, latest=max_date):
            return None
        LOGGER.debug(log_message, found)
        return found.strftime(outputformat)

    LOGGER.debug("custom parse test: %s", string)

    # 1. shortcut
    if string[:4].isdigit():
        parsed = None
        if string[4:8].isdigit():
            # a. '201709011234' not covered by dateparser, and regex too slow
            try:
                parsed = datetime(
                    int(string[:4]), int(string[4:6]), int(string[6:8])
                )
            except ValueError:
                LOGGER.debug("8-digit error: %s", string[:8])  # return None
        else:
            # b. much faster than extensive parsing
            try:
                parsed = datetime.fromisoformat(string)  # type: ignore[attr-defined]
            except ValueError:
                LOGGER.debug("not an ISO date string: %s", string)
                try:
                    parsed = dateutil_parse(string, fuzzy=False)  # ignoretz=True
                except (OverflowError, TypeError, ValueError):
                    LOGGER.debug("dateutil parsing error: %s", string)
        # c. plausibility test
        if parsed is not None and is_valid_date(
            parsed, outputformat, earliest=min_date, latest=max_date
        ):
            LOGGER.debug("parsing result: %s", parsed)
            return parsed.strftime(outputformat)

    # 2. Try YYYYMMDD, use regex
    hit = YMD_NO_SEP_PATTERN.search(string)
    if hit:
        digits = hit[1]
        try:
            parsed = datetime(int(digits[:4]), int(digits[4:6]), int(digits[6:8]))
        except ValueError:
            LOGGER.debug("YYYYMMDD value error: %s", hit[0])
        else:
            output = _validated(parsed, "YYYYMMDD match: %s")
            if output is not None:
                return output

    # 3. Try the very common YMD, Y-M-D, and D-M-Y patterns
    hit = YMD_PATTERN.search(string)
    if hit:
        try:
            if hit.lastgroup == "day":
                year = int(hit["year"])
                month = int(hit["month"])
                day = int(hit["day"])
            else:
                day = int(hit["day2"])
                month = int(hit["month2"])
                year = correct_year(int(hit["year2"]))
                day, month = try_swap_values(day, month)
            parsed = datetime(year, month, day)
        except ValueError:  # pragma: no cover
            LOGGER.debug("regex value error: %s", hit[0])
        else:
            output = _validated(parsed, "regex match: %s")
            if output is not None:
                return output

    # 4. Try the Y-M and M-Y patterns
    hit = YM_PATTERN.search(string)
    if hit:
        year_key, month_key = (
            ("year", "month") if hit.lastgroup == "month" else ("year2", "month2")
        )
        try:
            parsed = datetime(int(hit[year_key]), int(hit[month_key]), 1)
        except ValueError:  # pragma: no cover
            LOGGER.debug("Y-M value error: %s", hit[0])
        else:
            output = _validated(parsed, "Y-M match: %s")
            if output is not None:
                return output

    # 5. Try the other regex pattern
    text_date = regex_parse(string)
    if not is_valid_date(text_date, outputformat, earliest=min_date, latest=max_date):
        return None
    LOGGER.debug("custom parse result: %s", text_date)
    try:
        return text_date.strftime(outputformat)  # type: ignore
    except ValueError as err:
        LOGGER.error("value error during conversion: %s %s", string, err)
    return None
# (stray "[docs]" link marker from the rendered documentation page removed)
def external_date_parser(string: str, outputformat: str) -> Optional[str]:
    """Parse a string with the shared dateparser instance (EXTERNAL_PARSER).

    Args:
        string: candidate text.
        outputformat: strftime format of the returned string.

    Returns:
        The formatted date, or None if the parser raised OverflowError or
        ValueError or did not produce a date object.
    """
    LOGGER.debug("send to external parser: %s", string)
    try:
        parsed = EXTERNAL_PARSER.get_date_data(string)["date_obj"]
    # 2 types of errors possible
    except (OverflowError, ValueError) as err:  # pragma: no cover
        LOGGER.error("external parser error: %s %s", string, err)
        return None
    # issue with data type
    if parsed is None:
        return None
    return datetime.strftime(parsed, outputformat)
# (stray "[docs]" link marker from the rendered documentation page removed)
@lru_cache(maxsize=CACHE_SIZE)
def try_date_expr(
    string: Optional[str],
    outputformat: str,
    extensive_search: bool,
    min_date: datetime,
    max_date: datetime,
) -> Optional[str]:
    """Apply a chain of filters and parsers to a potential date expression.

    Results are memoized with lru_cache, keyed on all arguments.

    Args:
        string: candidate text; None or empty yields None.
        outputformat: strftime format of the returned string.
        extensive_search: whether the slow external parser may be used when
            the fast parser finds nothing.
        min_date: earliest acceptable date.
        max_date: latest acceptable date.

    Returns:
        The formatted date, or None if the string is rejected or unparsable.
    """
    if not string:
        return None

    # clean with trim_text(), then cap the length
    string = trim_text(string)[:MAX_SEGMENT_LEN]
    if not string:
        return None

    # formal constraint: 4 to 18 digits
    digit_count = sum(1 for char in string if char.isdigit())
    if digit_count < 4 or digit_count > 18:
        return None

    # check if string only contains time/single year or digits and not a date
    if DISCARD_PATTERNS.search(string):
        return None

    # try to parse using the faster method
    fast_result = custom_parse(string, outputformat, min_date, max_date)
    if fast_result is not None:
        return fast_result

    # slow but extensive search, guarded by a cheap filter to limit the cost
    if not extensive_search or not TEXT_DATE_PATTERN.search(string):
        return None

    slow_result = external_date_parser(string, outputformat)
    if is_valid_date(slow_result, outputformat, earliest=min_date, latest=max_date):
        return slow_result
    return None
def img_search(
    tree: HtmlElement,
    options: Extractor,
) -> Optional[str]:
    """Look for a date in the URL of the page's Open Graph image.

    Only the first meta element with property "og:image" and a content
    attribute is considered; its content is handed to extract_url_date().

    Returns:
        The result of extract_url_date(), or None if no such element exists.
    """
    og_image = tree.find('.//meta[@property="og:image"][@content]')
    if og_image is None:
        return None
    return extract_url_date(og_image.get("content"), options)
def pattern_search(
    text: str,
    date_pattern: Pattern[str],
    options: Extractor,
) -> Optional[str]:
    """Search a text with one regular expression and convert the date found.

    Args:
        text: string to search.
        date_pattern: compiled pattern whose first group holds a %Y-%m-%d date.
        options: provides the date bounds (min, max) and the output format.

    Returns:
        The first group converted to options.format, or None when the pattern
        does not match or the date is not valid.
    """
    found = date_pattern.search(text)
    if not found:
        return None
    if not is_valid_date(
        found[1], "%Y-%m-%d", earliest=options.min, latest=options.max
    ):
        return None
    LOGGER.debug("regex found: %s %s", date_pattern, found[0])
    return convert_date(found[1], "%Y-%m-%d", options.format)
def json_search(
    tree: HtmlElement,
    options: Extractor,
) -> Optional[str]:
    """Look for JSON time patterns in JSON sections of the tree.

    Args:
        tree: parsed HTML document.
        options: options.original selects the "datePublished" pattern,
            otherwise "dateModified" is used; also passed to pattern_search().

    Returns:
        The first valid date found in any matching script element, or None.
    """
    # determine pattern
    json_pattern = JSON_PUBLISHED if options.original else JSON_MODIFIED
    # look throughout the HTML tree
    for elem in tree.xpath(
        './/script[@type="application/ld+json" or @type="application/settings+json"]'
    ):
        if not elem.text or '"date' not in elem.text:
            continue
        # A page can embed several JSON blocks: keep scanning when this one
        # holds no valid date for the chosen key instead of giving up early.
        result = pattern_search(elem.text, json_pattern, options)
        if result is not None:
            return result
    return None
def idiosyncrasies_search(
    htmlstring: str,
    options: Extractor,
) -> Optional[str]:
    """Look for author-written dates throughout the web page.

    Only the first TEXT_PATTERNS match is examined.

    Args:
        htmlstring: text to search.
        options: provides the date bounds (min, max) and the output format.

    Returns:
        The date formatted with options.format, or None when nothing matches
        or the matched numbers do not form a valid date.
    """
    match = TEXT_PATTERNS.search(htmlstring)  # EN+DE+TR
    if not match:
        return None
    # only the groups of the alternative that matched are set
    parts = [group for group in match.groups() if group]
    if len(parts) != 3:
        return None
    candidate = None
    try:
        if len(parts[0]) == 4:
            # year in first position: YYYY/MM/DD
            candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2]))
        elif len(parts[2]) in (2, 4):
            # DD/MM/YY
            day, month = try_swap_values(int(parts[0]), int(parts[1]))
            year = correct_year(int(parts[2]))
            candidate = datetime(year, month, day)
    except ValueError:
        # out-of-range numbers (e.g. month 13, day 45) are not dates; both
        # branches are guarded so such text cannot raise out of this function
        LOGGER.debug("value error in idiosyncrasies: %s", match[0])
    if candidate is not None and is_valid_date(
        candidate, "%Y-%m-%d", earliest=options.min, latest=options.max
    ):
        return candidate.strftime(options.format)
    return None