# Source code for htmldate.extractors

# pylint:disable-msg=E0611,I1101
"""
Custom parsers and XPath expressions for date extraction
"""

import logging
import re

from datetime import datetime
from functools import lru_cache
from typing import List, Optional, Pattern, Tuple

# coverage for date parsing
from dateparser import DateDataParser  # type: ignore  # third-party, slow
from dateparser_data.settings import default_parsers

from dateutil.parser import parse as dateutil_parse

from lxml.etree import XPath
from lxml.html import HtmlElement

# own
from .settings import CACHE_SIZE
from .utils import Extractor, trim_text
from .validators import convert_date, is_valid_date


# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# Shared dateparser instance, built once at import time and queried by
# external_date_parser(). No language, locale or region restriction is set.
EXTERNAL_PARSER = DateDataParser(
    languages=None,
    locales=None,
    region=None,
    settings={
        "NORMALIZE": True,  # False may be faster
        # Keep every default dateparser parser except the three named below.
        "PARSERS": [
            p
            for p in default_parsers
            if p not in ("no-spaces-time", "relative-time", "timestamp")
        ],
        "PREFER_DATES_FROM": "past",
        "PREFER_LOCALE_DATE_ORDER": True,
        "RETURN_AS_TIMEZONE_AWARE": False,
        "STRICT_PARSING": True,
    },
)


# XPath prefixes selecting the elements to inspect: FAST_PREPEND restricts the
# search to a fixed list of element names, SLOW_PREPEND accepts any element.
FAST_PREPEND = ".//*[(self::div or self::h2 or self::h3 or self::h4 or self::li or self::p or self::span or self::time or self::ul)]"
# self::b or self::em or self::font or self::i or self::strong
SLOW_PREPEND = ".//*"

# XPath fragment: a predicate on id/class/itemprop values hinting at a date,
# in union with every <footer> and <small> element. It starts with "[", so it
# has to be appended to a path expression (presumably one of the prepends
# above -- the concatenation is not in this part of the file).
# translate(..., "D", "d") / ("M", "m") only lowercases that single letter.
# NOTE(review): in XPath 1.0 a node-set argument such as @id|@class is
# converted to the string-value of its first node only, so each of these tests
# looks at a single attribute rather than at every listed one -- confirm that
# this is intended.
DATE_EXPRESSIONS = """
[
    contains(translate(@id|@class|@itemprop, "D", "d"), 'date') or
    contains(translate(@id|@class|@itemprop, "D", "d"), 'datum') or
    contains(translate(@id|@class, "M", "m"), 'meta') or
    contains(@id|@class, 'time') or
    contains(@id|@class, 'publish') or
    contains(@id|@class, 'footer') or
    contains(@class, 'info') or
    contains(@class, 'post_detail') or
    contains(@class, 'block-content') or
    contains(@class, 'byline') or
    contains(@class, 'subline') or
    contains(@class, 'posted') or
    contains(@class, 'submitted') or
    contains(@class, 'created-post') or
    contains(@class, 'publication') or
    contains(@class, 'author') or
    contains(@class, 'autor') or
    contains(@class, 'field-content') or
    contains(@class, 'fa-clock-o') or
    contains(@class, 'fa-calendar') or
    contains(@class, 'fecha') or
    contains(@class, 'parution')
] |
.//footer | .//small
"""

# further tests needed:
# or contains(@class, 'article')
# or contains(@id, 'lastmod') or contains(@class, 'updated')

# Compiled XPath returning the text nodes that are direct children of the
# elements selected by FAST_PREPEND.
FREE_TEXT_EXPRESSIONS = XPath(FAST_PREPEND + "/text()")
# Length bounds for candidate text segments; try_date_expr() truncates its
# input to MAX_SEGMENT_LEN characters.
MIN_SEGMENT_LEN = 6
MAX_SEGMENT_LEN = 52

# discard parts of the webpage
# archive.org banner inserts
# (this compiled XPath is what discard_unwanted() removes from the tree)
DISCARD_EXPRESSIONS = XPath(""".//div[@id="wm-ipp-base" or @id="wm-ipp"]""")
# not discarded for consistency (see above):
# .//footer
# .//*[(self::div or self::section)][@id="footer" or @class="footer"]

# Regex building blocks interpolated into the compiled patterns of this module.
# They are deliberately loose (e.g. day "39", month "19" are accepted); the
# calendar check happens later when a datetime object is built.
DAY_RE = "[0-3]?[0-9]"
MONTH_RE = "[0-1]?[0-9]"
# Years 1990-2039. The alternation is wrapped in a non-capturing group so that
# it stays atomic when other tokens are concatenated to it: without the group,
# "{YEAR_RE}-{MONTH_RE}-{DAY_RE}" parses as "199x" OR "20xx-MM-DD", and
# "(?:{YEAR_RE}-)?" as "199x" OR "20xx-". Capturing-group numbering in the
# patterns that embed YEAR_RE is unaffected.
YEAR_RE = "(?:199[0-9]|20[0-3][0-9])"

# regex cache
# Eight consecutive digits delimited by word boundaries (YYYYMMDD candidates).
YMD_NO_SEP_PATTERN = re.compile(r"\b(\d{8})\b")
# Either year-month-day (groups year/month/day) or day-month-year (groups
# day2/month2/year2, where the year may have 2 to 4 digits); custom_parse()
# tells the two apart through match.lastgroup.
YMD_PATTERN = re.compile(
    rf"(?:\D|^)(?:(?P<year>{YEAR_RE})[\-/.](?P<month>{MONTH_RE})[\-/.](?P<day>{DAY_RE})|"
    rf"(?P<day2>{DAY_RE})[\-/.](?P<month2>{MONTH_RE})[\-/.](?P<year2>\d{{2,4}}))(?:\D|$)"
)
# Same idea without a day: year-month (year/month) or month-year (month2/year2).
YM_PATTERN = re.compile(
    rf"(?:\D|^)(?:(?P<year>{YEAR_RE})[\-/.](?P<month>{MONTH_RE})|"
    rf"(?P<month2>{MONTH_RE})[\-/.](?P<year2>{YEAR_RE}))(?:\D|$)"
)

# Alternation of month names and abbreviations. The literal is split over
# several lines for readability only: LONG_TEXT_PATTERN strips the newlines
# before compiling.
# regex_parse() looks the matched name up in TEXT_MONTHS after lowercasing it,
# so every spelling accepted here needs a corresponding TEXT_MONTHS key.
REGEX_MONTHS = """
January?|February?|March|A[pv]ril|Ma[iy]|Jun[ei]|Jul[iy]|August|September|O[ck]tober|November|De[csz]ember|
Jan|Feb|M[aä]r|Apr|Jun|Jul|Aug|Sep|O[ck]t|Nov|De[cz]|
Januari|Februari|Maret|Mei|Agustus|
Jänner|Feber|März|
janvier|février|mars|juin|juillet|aout|septembre|octobre|novembre|décembre|
Ocak|Şubat|Mart|Nisan|Mayıs|Haziran|Temmuz|Ağustos|Eylül|Ekim|Kasım|Aralık|
Oca|Şub|Mar|Nis|Haz|Tem|Ağu|Eyl|Eki|Kas|Ara
"""  # todo: check "août"
# Case-insensitive pattern with two alternatives:
#   month day[,] year     -> groups month / day / year
#   day [of] month year   -> groups day2 / month2 / year2
LONG_TEXT_PATTERN = re.compile(
    rf"""(?P<month>{REGEX_MONTHS})\s
(?P<day>{DAY_RE})(?:st|nd|rd|th)?,? (?P<year>{YEAR_RE})|
(?P<day2>{DAY_RE})(?:st|nd|rd|th|\.)? (?:of )?
(?P<month2>{REGEX_MONTHS})[,.]? (?P<year2>{YEAR_RE})""".replace(
        "\n", ""
    ),
    re.I,
)

# Year/month/day separated by "/", "_" or "-", preceded by a non-digit;
# searched in URLs by extract_url_date().
COMPLETE_URL = re.compile(rf"\D({YEAR_RE})[/_-]({MONTH_RE})[/_-]({DAY_RE})(?:\D|$)")

# JSON key/value patterns capturing the date part of the value; json_search()
# uses JSON_PUBLISHED when options.original is set and JSON_MODIFIED otherwise.
JSON_MODIFIED = re.compile(rf'"dateModified": ?"({YEAR_RE}-{MONTH_RE}-{DAY_RE})', re.I)
JSON_PUBLISHED = re.compile(
    rf'"datePublished": ?"({YEAR_RE}-{MONTH_RE}-{DAY_RE})', re.I
)
# A date followed by any single character and an HH:MM:SS time; only the date
# part is captured.
TIMESTAMP_PATTERN = re.compile(
    rf"({YEAR_RE}-{MONTH_RE}-{DAY_RE}).[0-9]{{2}}:[0-9]{{2}}:[0-9]{{2}}"
)

# English, French, German, Indonesian and Turkish dates cache
# One tuple per calendar month, in order (index 0 = January); all entries are
# lowercase.
MONTHS = [
    ("jan", "januar", "jänner", "january", "januari", "janvier", "ocak", "oca"),
    ("feb", "februar", "feber", "february", "februari", "février", "şubat", "şub"),
    ("mar", "mär", "märz", "march", "maret", "mart", "mars"),
    ("apr", "april", "avril", "nisan", "nis"),
    ("may", "mai", "mei", "mayıs"),
    ("jun", "juni", "june", "juin", "haziran", "haz"),
    ("jul", "juli", "july", "juillet", "temmuz", "tem"),
    ("aug", "august", "agustus", "ağustos", "ağu", "aout"),
    ("sep", "september", "septembre", "eylül", "eyl"),
    ("oct", "oktober", "october", "octobre", "okt", "ekim", "eki"),
    ("nov", "november", "kasım", "kas", "novembre"),
    ("dec", "dez", "dezember", "december", "desember", "décembre", "aralık", "ara"),
]

# Flat lookup table: lowercase month name -> month number (1-12).
TEXT_MONTHS = {
    month: mnum for mnum, mlist in enumerate(MONTHS, start=1) for month in mlist
}

# Matches strings that contain one of the listed separator characters or that
# consist of digits only; try_date_expr() requires a match before calling the
# slow external parser.
TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")

# Strings matching any alternative are rejected by try_date_expr() before any
# parsing is attempted. The first two alternatives cover a leading HH:MM time
# and a string whose only digits are one 4-digit run.
DISCARD_PATTERNS = re.compile(
    r"^\d{2}:\d{2}(?: |:|$)|"
    r"^\D*\d{4}\D*$|"
    r"[$€¥Ұ£¢₽₱฿#₹]|"  # currency symbols and special characters
    r"[A-Z]{3}[^A-Z]|"  # currency codes
    r"(?:^|\D)(?:\+\d{2}|\d{3}|\d{5})\D|"  # tel./IPs/postal codes
    r"ftps?|https?|sftp|"  # protocols
    r"\.(?:com|net|org|info|gov|edu|de|fr|io)\b|"  # TLDs
    r"IBAN|[A-Z]{2}[0-9]{2}|"  # bank accounts
    r"®"  # ©
)

# use of regex module for speed?
# Numeric dates introduced (or, for the last alternative, followed) by a
# keyword. Each of the four alternatives has its own three capturing groups,
# so a match leaves exactly three groups set; idiosyncrasies_search() filters
# out the unset ones.
TEXT_PATTERNS = re.compile(
    r'(?:date[^0-9"]{,20}|updated|published|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|'  # EN
    r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|"  # DE
    r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
    r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)",  # TR
    re.I,
)

# core patterns
# day-month-year with a 4-digit year (A) or a 2-digit year (B); B has one
# group triple for "/" separators and another for "." or "-" separators.
THREE_COMP_REGEX_A = re.compile(rf"({DAY_RE})[/.-]({MONTH_RE})[/.-]({YEAR_RE})")
THREE_COMP_REGEX_B = re.compile(
    rf"({DAY_RE})/({MONTH_RE})/([0-9]{{2}})|({DAY_RE})[.-]({MONTH_RE})[.-]([0-9]{{2}})"
)
# month-year
TWO_COMP_REGEX = re.compile(rf"({MONTH_RE})[/.-]({YEAR_RE})")

# extensive search patterns
# Several of these come in pairs: a *_PATTERN locating a candidate substring
# and a *_CATCH / *_YEAR pattern extracting components from it (the code
# applying them is not in this part of the file).
YEAR_PATTERN = re.compile(rf"^\D?({YEAR_RE})")
# Copyright marker followed by a year, optionally written as a year range;
# the captured group is the last year.
COPYRIGHT_PATTERN = re.compile(
    rf"(?:©|\&copy;|Copyright|\(c\))\D*(?:{YEAR_RE}-)?({YEAR_RE})\D"
)
THREE_PATTERN = re.compile(r"/([0-9]{4}/[0-9]{2}/[0-9]{2})[01/]")
THREE_CATCH = re.compile(r"([0-9]{4})/([0-9]{2})/([0-9]{2})")
THREE_LOOSE_PATTERN = re.compile(r"\D([0-9]{4}[/.-][0-9]{2}[/.-][0-9]{2})\D")
THREE_LOOSE_CATCH = re.compile(r"([0-9]{4})[/.-]([0-9]{2})[/.-]([0-9]{2})")
# Despite the name, this one matches day-month-year order (year last).
SELECT_YMD_PATTERN = re.compile(r"\D([0-3]?[0-9][/.-][01]?[0-9][/.-][0-9]{4})\D")
SELECT_YMD_YEAR = re.compile(rf"({YEAR_RE})\D?$")
YMD_YEAR = re.compile(rf"^({YEAR_RE})")
# Eight-digit YYYYMMDD strings delimited by non-digits (19xx or 20xx).
DATESTRINGS_PATTERN = re.compile(
    r"(\D19[0-9]{2}[01][0-9][0-3][0-9]\D|\D20[0-9]{2}[01][0-9][0-3][0-9]\D)"
)
DATESTRINGS_CATCH = re.compile(rf"({YEAR_RE})([01][0-9])([0-3][0-9])")
# day/month/2-digit year, slash- or dot-separated.
SLASHES_PATTERN = re.compile(
    r"\D([0-3]?[0-9]/[01]?[0-9]/[0129][0-9]|[0-3][0-9]\.[01][0-9]\.[0129][0-9])\D"
)
SLASHES_YEAR = re.compile(r"([0-9]{2})$")
YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-](?:1[0-2]|0[1-9]))\D")
# NOTE(review): the month group ends with an empty alternative ("|)"), so it
# can match with an empty month -- confirm whether that is intended.
YYYYMM_CATCH = re.compile(rf"({YEAR_RE})[/.-](1[0-2]|0[1-9]|)")
MMYYYY_PATTERN = re.compile(r"\D([01]?[0-9][/.-][12][0-9]{3})\D")
MMYYYY_YEAR = re.compile(rf"({YEAR_RE})\D?$")
# A year between two non-digits, unless the text just before is "w3" + any
# character + "org" (the "." in the lookbehind is not escaped).
SIMPLE_PATTERN = re.compile(rf"(?<!w3.org)\D({YEAR_RE})\D")


def discard_unwanted(tree: HtmlElement) -> Tuple[HtmlElement, List[HtmlElement]]:
    """Detach every node selected by DISCARD_EXPRESSIONS from the document.

    The tree is modified in place. Returns the same tree together with the
    list of removed nodes, in the order the XPath expression reported them.
    """
    removed = list(DISCARD_EXPRESSIONS(tree))
    for node in removed:
        node.getparent().remove(node)
    return tree, removed


def extract_url_date(
    testurl: Optional[str],
    options: Extractor,
) -> Optional[str]:
    """Look for a year-month-day sequence in a URL and return it as a date string.

    The first COMPLETE_URL match is converted to a datetime; if it passes
    is_valid_date() within options.min / options.max, it is returned formatted
    with options.format. Returns None when the URL is None, nothing matches,
    the components are not a real calendar date, or validation fails.
    """
    if testurl is None:
        return None

    found = COMPLETE_URL.search(testurl)
    if found is None:
        return None

    LOGGER.debug("found date in URL: %s", found[0])
    year, month, day = found[1], found[2], found[3]
    try:
        url_date = datetime(int(year), int(month), int(day))
        accepted = is_valid_date(
            url_date, options.format, earliest=options.min, latest=options.max
        )
        if accepted:
            return url_date.strftime(options.format)
    except ValueError as err:  # pragma: no cover
        LOGGER.debug("conversion error: %s %s", found[0], err)
    return None
def correct_year(year: int) -> int:
    """Expand a two-digit year to four digits.

    Values below 100 are mapped to the 1900s when they are 90 or more and to
    the 2000s otherwise; any other value is returned unchanged.
    """
    if year >= 100:
        return year
    century = 1900 if year >= 90 else 2000
    return year + century


def try_swap_values(day: int, month: int) -> Tuple[int, int]:
    """Swap day and month values if it seems feasible.

    The swap only happens when the month is impossible (above 12) while the
    day would be acceptable as a month (12 or less).
    """
    if day <= 12 < month:
        return month, day
    return day, month
def regex_parse(string: str) -> Optional[datetime]:
    """Try full-text parse for date elements using a series of regular expressions
    with particular emphasis on English, French, German and Turkish.

    Searches the string with LONG_TEXT_PATTERN ("month day, year" or
    "day month year" with a spelled-out month) and converts the first match.

    Returns:
        The parsed datetime, or None if nothing matches, the month spelling
        is unknown to TEXT_MONTHS, or the components do not form a valid
        calendar date.
    """
    # https://github.com/vi3k6i5/flashtext ?
    # multilingual day-month-year + American English patterns
    match = LONG_TEXT_PATTERN.search(string)
    if not match:
        return None
    # process and return
    try:
        # the last group that took part in the match tells which of the two
        # alternatives (month-first or day-first) was used
        groups = (
            ("day", "month", "year")
            if match.lastgroup == "year"
            else ("day2", "month2", "year2")
        )
        day, month, year = (
            int(match.group(groups[0])),
            int(TEXT_MONTHS[match.group(groups[1]).lower().strip(".")]),
            int(match.group(groups[2])),
        )
        year = correct_year(year)
        day, month = try_swap_values(day, month)
        dateobject = datetime(year, month, day)
    except KeyError:
        # The pattern is case-insensitive, so it can accept a spelling whose
        # lowercased form is not a TEXT_MONTHS key (e.g. Turkish dotted or
        # dotless "i" variants): treat it as "no date" instead of crashing.
        LOGGER.debug("unknown month name: %s", match[0])
        return None
    except ValueError:
        return None
    LOGGER.debug("multilingual text found: %s", dateobject)
    return dateobject
def custom_parse(
    string: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
    """Try to bypass the slow dateparser

    Runs a cascade of cheap strategies, in order, and returns the first
    candidate accepted by is_valid_date() within min_date / max_date,
    formatted with ``outputformat``. Returns None if no strategy produces an
    accepted date.
    """
    LOGGER.debug("custom parse test: %s", string)

    # 1. shortcut
    # (only taken when the string starts with four digits)
    if string[:4].isdigit():
        candidate = None
        # a. '201709011234' not covered by dateparser, and regex too slow
        if string[4:8].isdigit():
            try:
                candidate = datetime(
                    int(string[:4]), int(string[4:6]), int(string[6:8])
                )
            except ValueError:
                LOGGER.debug("8-digit error: %s", string[:8])
                # return None
                # (no early return: the remaining strategies are still tried)
        # b. much faster than extensive parsing
        else:
            try:
                candidate = datetime.fromisoformat(string)  # type: ignore[attr-defined]
            except ValueError:
                LOGGER.debug("not an ISO date string: %s", string)
                # dateutil is only consulted when the ISO parser gave up
                try:
                    candidate = dateutil_parse(string, fuzzy=False)  # ignoretz=True
                except (OverflowError, TypeError, ValueError):
                    LOGGER.debug("dateutil parsing error: %s", string)
        # c. plausibility test
        if candidate is not None and (
            is_valid_date(candidate, outputformat, earliest=min_date, latest=max_date)
        ):
            LOGGER.debug("parsing result: %s", candidate)
            return candidate.strftime(outputformat)

    # 2. Try YYYYMMDD, use regex
    match = YMD_NO_SEP_PATTERN.search(string)
    if match:
        try:
            year, month, day = int(match[1][:4]), int(match[1][4:6]), int(match[1][6:8])
            candidate = datetime(year, month, day)
        except ValueError:
            LOGGER.debug("YYYYMMDD value error: %s", match[0])
        else:
            if is_valid_date(candidate, "%Y-%m-%d", earliest=min_date, latest=max_date):
                LOGGER.debug("YYYYMMDD match: %s", candidate)
                return candidate.strftime(outputformat)

    # 3. Try the very common YMD, Y-M-D, and D-M-Y patterns
    match = YMD_PATTERN.search(string)
    if match:
        try:
            # lastgroup is "day" for the year-first alternative of the pattern
            if match.lastgroup == "day":
                year, month, day = (
                    int(match.group("year")),
                    int(match.group("month")),
                    int(match.group("day")),
                )
            else:
                day, month, year = (
                    int(match.group("day2")),
                    int(match.group("month2")),
                    int(match.group("year2")),
                )
                # the day-first form may carry a 2-digit year and an ambiguous
                # day/month order
                year = correct_year(year)
                day, month = try_swap_values(day, month)

            candidate = datetime(year, month, day)
        except ValueError:  # pragma: no cover
            LOGGER.debug("regex value error: %s", match[0])
        else:
            if is_valid_date(candidate, "%Y-%m-%d", earliest=min_date, latest=max_date):
                LOGGER.debug("regex match: %s", candidate)
                return candidate.strftime(outputformat)

    # 4. Try the Y-M and M-Y patterns
    # (the day is not known, so the first of the month is used)
    match = YM_PATTERN.search(string)
    if match:
        try:
            if match.lastgroup == "month":
                candidate = datetime(
                    int(match.group("year")), int(match.group("month")), 1
                )
            else:
                candidate = datetime(
                    int(match.group("year2")), int(match.group("month2")), 1
                )
        except ValueError:  # pragma: no cover
            LOGGER.debug("Y-M value error: %s", match[0])
        else:
            if is_valid_date(candidate, "%Y-%m-%d", earliest=min_date, latest=max_date):
                LOGGER.debug("Y-M match: %s", candidate)
                return candidate.strftime(outputformat)

    # 5. Try the other regex pattern
    # (regex_parse may return None, which is passed to is_valid_date as is)
    dateobject = regex_parse(string)
    if is_valid_date(dateobject, outputformat, earliest=min_date, latest=max_date):
        try:
            LOGGER.debug("custom parse result: %s", dateobject)
            return dateobject.strftime(outputformat)  # type: ignore
        except ValueError as err:
            LOGGER.error("value error during conversion: %s %s", string, err)

    return None
def external_date_parser(string: str, outputformat: str) -> Optional[str]:
    """Parse a string with the shared dateparser instance (EXTERNAL_PARSER).

    Returns the date formatted with ``outputformat``, or None when the parser
    finds no date or raises OverflowError / ValueError.
    """
    LOGGER.debug("send to external parser: %s", string)
    try:
        parsed = EXTERNAL_PARSER.get_date_data(string)["date_obj"]
    except (OverflowError, ValueError) as err:  # pragma: no cover
        # 2 types of errors possible
        LOGGER.error("external parser error: %s %s", string, err)
        return None
    if parsed is None:
        return None
    # issue with data type
    return datetime.strftime(parsed, outputformat)
@lru_cache(maxsize=CACHE_SIZE)
def try_date_expr(
    string: Optional[str],
    outputformat: str,
    extensive_search: bool,
    min_date: datetime,
    max_date: datetime,
) -> Optional[str]:
    """Apply heuristics to decide whether a text segment holds a date and parse it.

    The segment is trimmed and cut to MAX_SEGMENT_LEN characters, filtered on
    its digit count and on DISCARD_PATTERNS, then handed to custom_parse().
    Only if that fails and ``extensive_search`` is set is the external parser
    tried, and its result validated. Results are memoised with lru_cache.

    Returns the formatted date string, or None.
    """
    if not string:
        return None

    # trim and cap the length
    string = trim_text(string)[:MAX_SEGMENT_LEN]
    if not string:
        return None

    # formal constraint: 4 to 18 digits
    digit_count = sum(1 for char in string if char.isdigit())
    if digit_count < 4 or digit_count > 18:
        return None

    # time only, single year, digits that are not a date, ...
    if DISCARD_PATTERNS.search(string):
        return None

    # fast path
    fast_result = custom_parse(string, outputformat, min_date, max_date)
    if fast_result is not None:
        return fast_result

    # slow path, guarded by an extra filter to limit computational cost
    if not extensive_search or not TEXT_DATE_PATTERN.search(string):
        return None

    slow_result = external_date_parser(string, outputformat)
    if is_valid_date(slow_result, outputformat, earliest=min_date, latest=max_date):
        return slow_result
    return None
def img_search(
    tree: HtmlElement,
    options: Extractor,
) -> Optional[str]:
    """Skim through image elements.

    Looks at the first ``og:image`` meta element carrying a ``content``
    attribute and tries to extract a date from that URL.
    """
    element = tree.find('.//meta[@property="og:image"][@content]')
    if element is not None:
        return extract_url_date(element.get("content"), options)
    return None


def pattern_search(
    text: str,
    date_pattern: Pattern[str],
    options: Extractor,
) -> Optional[str]:
    """Look for date expressions using a regular expression on a string of text.

    The first capturing group of ``date_pattern`` is expected to hold a
    ``%Y-%m-%d`` date. Returns it converted to options.format if it passes
    is_valid_date() within options.min / options.max, else None.
    """
    match = date_pattern.search(text)
    if match and is_valid_date(
        match[1], "%Y-%m-%d", earliest=options.min, latest=options.max
    ):
        LOGGER.debug("regex found: %s %s", date_pattern, match[0])
        return convert_date(match[1], "%Y-%m-%d", options.format)
    return None


def json_search(
    tree: HtmlElement,
    options: Extractor,
) -> Optional[str]:
    """Look for JSON time patterns in JSON sections of the tree.

    Scans the ld+json / settings+json script elements in document order and
    returns the first valid date found, or None if no script yields one.
    """
    # determine pattern
    json_pattern = JSON_PUBLISHED if options.original else JSON_MODIFIED
    # look throughout the HTML tree
    for elem in tree.xpath(
        './/script[@type="application/ld+json" or @type="application/settings+json"]'
    ):
        if not elem.text or '"date' not in elem.text:
            continue
        # keep scanning when this script has no usable date: a later script
        # element may still contain the wanted key
        result = pattern_search(elem.text, json_pattern, options)
        if result is not None:
            return result
    return None


def idiosyncrasies_search(
    htmlstring: str,
    options: Extractor,
) -> Optional[str]:
    """Look for author-written dates throughout the web page.

    Uses the first TEXT_PATTERNS match (keyword plus numeric date) and
    returns the date formatted with options.format if it is a real calendar
    date accepted by is_valid_date(); otherwise returns None.
    """
    match = TEXT_PATTERNS.search(htmlstring)  # EN+DE+TR
    if not match:
        return None
    # only the three groups of the alternative that matched are set
    parts = [group for group in match.groups() if group]
    if len(parts) != 3:
        return None

    candidate = None
    try:
        if len(parts[0]) == 4:
            # YYYY/MM/DD
            candidate = datetime(int(parts[0]), int(parts[1]), int(parts[2]))
        elif len(parts[2]) in (2, 4):
            # DD/MM/YY
            day, month = try_swap_values(int(parts[0]), int(parts[1]))
            year = correct_year(int(parts[2]))
            candidate = datetime(year, month, day)
    except ValueError:
        # the pattern accepts any digits, e.g. month 13 or day 45
        LOGGER.debug("value error in idiosyncrasies: %s", match[0])

    if candidate is not None and is_valid_date(
        candidate, "%Y-%m-%d", earliest=options.min, latest=options.max
    ):
        return candidate.strftime(options.format)
    return None