# pylint:disable-msg=E0611,I1101
"""Module bundling all functions needed to determine the date of HTML strings
or LXML trees."""

import logging
import re

from collections import Counter
from copy import deepcopy
from datetime import datetime
from functools import lru_cache, partial
from typing import Match, Optional, Pattern, Union, Counter as Counter_Type

from lxml.html import HtmlElement, tostring

# own
from .extractors import (
from .utils import Extractor, clean_html, load_html, trim_text
from .validators import (

LOGGER = logging.getLogger(__name__)

def logstring(element: HtmlElement) -> str:
    """Return the element serialized as a unicode string, stripped of
    surrounding whitespace, so that it can be passed to the logger."""
    serialized = tostring(element, pretty_print=False, encoding="unicode")
    return serialized.strip()

    # Dublin Core:
    "meta",  # too loose?
    # Open Graph:



# itemprop values of meta elements accepted when the original date is requested
ITEMPROP_ATTRS_ORIGINAL = {"datecreated", "datepublished", "pubyear"}
# itemprop values of meta elements accepted when the original date is not requested
ITEMPROP_ATTRS_MODIFIED = {"datemodified", "dateupdate"}
# class values of abbr elements whose title or text is examined for a date
CLASS_ATTRS = {"date-published", "published", "time published"}

# matches a trailing run of non-digit characters (used to trim text segments)
NON_DIGITS_REGEX = re.compile(r"\D+$")


def examine_text(
    text: str,
    options: Extractor,
) -> Optional[str]:
    """Prepare text and try to extract a date.

    :param text: Text segment to examine
    :type text: string
    :param options: Options for extraction
    :type options: Extractor
    :return: Returns the outcome of the date expression search: a date
        string, or None
    """
    text = trim_text(text)

    # segments at or below the minimum length are not examined at all
    if len(text) <= MIN_SEGMENT_LEN:
        return None

    # cap the segment length first, then drop trailing non-digit characters
    text = NON_DIGITS_REGEX.sub("", text[:MAX_SEGMENT_LEN])
    return try_date_expr(
        text, options.format, options.extensive, options.min, options.max
    )

def examine_date_elements(
    tree: HtmlElement,
    expression: str,
    options: Extractor,
) -> Optional[str]:
    """Evaluate an XPath expression on the tree and scan the resulting
    elements, in order, for a date expression.

    Nothing is examined if the expression yields no element or more
    elements than MAX_POSSIBLE_CANDIDATES; None is returned in that case
    and also when no element provides a date.
    """
    candidates = tree.xpath(expression)
    if not candidates or len(candidates) > MAX_POSSIBLE_CANDIDATES:
        return None

    for node in candidates:
        # the element text comes first, the link title (Blogspot) second
        found = examine_text(node.text_content(), options) or examine_text(
            node.get("title", ""), options
        )
        if found:
            return found

    return None

def examine_header(
    tree: HtmlElement,
    options: Extractor,
) -> Optional[str]:
    """
    Parse header elements to find date cues

    The meta elements are visited in document order and the loop stops at
    the first element which yields a date. Less precise or less suitable
    values (URL date, date of the other kind, copyright year) are kept as a
    reserve and only used if no element yields a date directly.

    :param tree:
        LXML parsed tree object
    :type tree: LXML tree
    :param options:
        Options for extraction
    :type options: Extractor
    :return: Returns a valid date expression as a string, or None

    """
    headerdate, reserve = None, None
    # pre-bind the extraction options so that only the text has to be passed
    tryfunc = partial(
        try_date_expr,
        outputformat=options.format,
        extensive_search=options.extensive,
        min_date=options.min,
        max_date=options.max,
    )
    # loop through all meta elements
    for elem in tree.iterfind(".//meta"):
        # safeguard: skip elements without attributes or without any value
        if not elem.attrib or (
            "content" not in elem.attrib and "datetime" not in elem.attrib
        ):
            continue
        # name attribute, most frequent
        if "name" in elem.attrib:
            attribute = elem.get("name", "").lower()
            # url
            if attribute == "og:url":
                reserve = extract_url_date(elem.get("content"), options)
            # date
            elif attribute in DATE_ATTRIBUTES:
                LOGGER.debug("examining meta name: %s", logstring(elem))
                headerdate = tryfunc(elem.get("content"))
            # modified
            elif attribute in NAME_MODIFIED:
                LOGGER.debug("examining meta name: %s", logstring(elem))
                if not options.original:
                    headerdate = tryfunc(elem.get("content"))
                else:
                    reserve = tryfunc(elem.get("content"))
        # property attribute
        elif "property" in elem.attrib:
            attribute = elem.get("property", "").lower()
            if attribute in DATE_ATTRIBUTES or attribute in PROPERTY_MODIFIED:
                LOGGER.debug("examining meta property: %s", logstring(elem))
                attempt = tryfunc(elem.get("content"))
                if attempt is not None:
                    # the kind of date has to match the kind requested
                    if (attribute in DATE_ATTRIBUTES and options.original) or (
                        attribute in PROPERTY_MODIFIED and not options.original
                    ):
                        headerdate = attempt
                    # hurts precision
                    else:
                        reserve = attempt
        # itemprop
        elif "itemprop" in elem.attrib:
            attribute = elem.get("itemprop", "").lower()
            # original: store / updated: override date
            if attribute in ITEMPROP_ATTRS:
                LOGGER.debug("examining meta itemprop: %s", logstring(elem))
                attempt = tryfunc(elem.get("datetime") or elem.get("content"))
                # store value
                if attempt is not None:
                    if (attribute in ITEMPROP_ATTRS_ORIGINAL and options.original) or (
                        attribute in ITEMPROP_ATTRS_MODIFIED and not options.original
                    ):
                        headerdate = attempt
                    # put on hold: hurts precision
                    # else:
                    #    reserve = attempt
            # reserve with copyrightyear
            elif attribute == "copyrightyear":
                LOGGER.debug("examining meta itemprop: %s", logstring(elem))
                if "content" in elem.attrib:
                    # only a year is given: complete it to January 1st
                    attempt = "-".join([elem.get("content", ""), "01", "01"])
                    if is_valid_date(
                        attempt, "%Y-%m-%d", earliest=options.min, latest=options.max
                    ):
                        reserve = attempt
        # pubdate, relatively rare
        elif "pubdate" in elem.attrib:
            if elem.get("pubdate", "").lower() == "pubdate":
                LOGGER.debug("examining meta pubdate: %s", logstring(elem))
                headerdate = tryfunc(elem.get("content"))
        # http-equiv, rare
        elif "http-equiv" in elem.attrib:
            attribute = elem.get("http-equiv", "").lower()
            if attribute == "date":
                LOGGER.debug("examining meta http-equiv: %s", logstring(elem))
                if options.original:
                    headerdate = tryfunc(elem.get("content"))
                else:
                    reserve = tryfunc(elem.get("content"))
            elif attribute == "last-modified":
                LOGGER.debug("examining meta http-equiv: %s", logstring(elem))
                if not options.original:
                    headerdate = tryfunc(elem.get("content"))
                else:
                    reserve = tryfunc(elem.get("content"))
        # exit loop
        if headerdate is not None:
            break
    # if nothing was found, look for lower granularity (so far: "copyright year")
    if headerdate is None and reserve is not None:
        LOGGER.debug("opting for reserve date with less granularity")
        headerdate = reserve
    # return value
    return headerdate
def select_candidate(
    occurrences: Counter_Type[str],
    catch: Pattern[str],
    yearpat: Pattern[str],
    options: Extractor,
) -> Optional[Match[str]]:
    """Select a candidate among the most frequent matches

    :param occurrences: Candidate strings along with their frequency
    :param catch: Pattern applied to the selected candidate, its match
        object is the return value
    :param yearpat: Pattern whose first group captures the year of a candidate
    :param options: Options for extraction
    :return: Match object for the selected candidate, or None
    """
    if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES:
        return None

    # a single candidate cannot be weighed against a rival: the comparison
    # logic below indexes two entries and would fail on a single one
    if len(occurrences) == 1:
        return catch.search(next(iter(occurrences)))

    # select among most frequent: more than 10? more than 2 candidates?
    firstselect = occurrences.most_common(10)
    LOGGER.debug("firstselect: %s", firstselect)
    # sort and find probable candidates
    bestones = sorted(firstselect, reverse=not options.original)[:2]
    LOGGER.debug("bestones: %s", bestones)

    # plausibility heuristics
    patterns, counts = zip(*bestones)
    years = [""] * len(bestones)
    validation = [False] * len(bestones)
    for i, pattern in enumerate(patterns):
        year_match = yearpat.search(pattern)
        if year_match:
            years[i] = year_match[1]
            dateobject = datetime(int(year_match[1]), 1, 1)
            validation[i] = is_valid_date(
                dateobject, "%Y", earliest=options.min, latest=options.max
            )

    # safety net: plausibility
    match = None
    if all(validation):
        # same number of occurrences: always take top of the pile?
        if counts[0] == counts[1]:
            match = catch.search(patterns[0])
        # safety net: newer date but up to 50% less frequent
        elif years[1] != years[0] and counts[1] / counts[0] > 0.5:
            match = catch.search(patterns[1])
        # not newer or hopefully not significant
        else:
            match = catch.search(patterns[0])
    elif any(validation):
        match = catch.search(patterns[validation.index(True)])
    else:
        LOGGER.debug("no suitable candidate: %s %s", years[0], years[1])
    return match


def search_pattern(
    htmlstring: str,
    pattern: Pattern[str],
    catch: Pattern[str],
    yearpat: Pattern[str],
    options: Extractor,
) -> Optional[Match[str]]:
    """Chained candidate filtering and selection

    The candidates found by the plausible year filter are handed over to
    select_candidate(), whose result is returned.
    """
    candidates = plausible_year_filter(
        htmlstring,
        pattern=pattern,
        yearpat=yearpat,
        earliest=options.min,
        latest=options.max,
    )
    return select_candidate(candidates, catch, yearpat, options)


@lru_cache(maxsize=CACHE_SIZE)
def compare_reference(
    reference: int,
    expression: str,
    options: Extractor,
) -> int:
    """Compare candidate to current date reference
    (includes date validation and older/newer test)

    :return: The outcome of the comparison if a date could be extracted
        from the expression, the unchanged reference otherwise
    """
    attempt = try_date_expr(
        expression, options.format, options.extensive, options.min, options.max
    )
    if attempt is not None:
        return compare_values(reference, attempt, options)
    return reference


def examine_abbr_elements(
    tree: HtmlElement,
    options: Extractor,
) -> Optional[str]:
    """Scan the page for abbr elements and check if their content contains
    an eligible date

    :return: Returns a date expression as a string, or None (also if the
        number of abbr elements reaches MAX_POSSIBLE_CANDIDATES)
    """
    result = None
    elements = tree.findall(".//abbr")
    if elements is not None and len(elements) < MAX_POSSIBLE_CANDIDATES:
        # 0 stands for "no reference found so far"
        reference = 0
        for elem in elements:
            # data-utime (mostly Facebook)
            if "data-utime" in elem.attrib:
                try:
                    candidate = int(elem.get("data-utime", ""))
                except ValueError:
                    continue
                LOGGER.debug("data-utime found: %s", candidate)
                # look for original date
                if options.original and (reference == 0 or candidate < reference):
                    reference = candidate
                # look for newest (i.e. largest time delta)
                elif not options.original and candidate > reference:
                    reference = candidate
            # class
            elif elem.get("class") in CLASS_ATTRS:
                # other attributes
                if "title" in elem.attrib:
                    trytext = elem.get("title")
                    LOGGER.debug("abbr published-title found: %s", trytext)
                    # shortcut
                    if options.original:
                        attempt = try_date_expr(
                            trytext,
                            options.format,
                            options.extensive,
                            options.min,
                            options.max,
                        )
                        if attempt is not None:
                            return attempt
                    else:
                        reference = compare_reference(reference, trytext, options)
                        # faster execution
                        if reference > 0:
                            break
                # dates, not times of the day
                elif elem.text and len(elem.text) > 10:
                    LOGGER.debug("abbr published found: %s", elem.text)
                    reference = compare_reference(reference, elem.text, options)
        converted = check_extracted_reference(reference, options)
        # return or try rescue in abbr content
        result = converted or examine_date_elements(
            tree,
            ".//abbr",
            options,
        )
    return result


def examine_time_elements(
    tree: HtmlElement,
    options: Extractor,
) -> Optional[str]:
    """Scan the page for time elements and check if their content contains
    an eligible date

    :return: Returns a date expression as a string, or None (also if the
        number of time elements reaches MAX_POSSIBLE_CANDIDATES)
    """
    result = None
    elements = tree.findall(".//time")
    if elements is not None and len(elements) < MAX_POSSIBLE_CANDIDATES:
        # scan all the tags and look for the newest one
        reference = 0
        for elem in elements:
            # set if the element can be trusted without further comparison
            shortcut_flag = False
            # go for datetime
            if len(elem.get("datetime", "")) > 6:
                # shortcut: time pubdate
                if (
                    "pubdate" in elem.attrib
                    and elem.get("pubdate") == "pubdate"
                    and options.original
                ):
                    shortcut_flag = True
                    LOGGER.debug(
                        "shortcut for time pubdate found: %s", elem.get("datetime")
                    )
                # shortcuts: class attribute
                elif "class" in elem.attrib:
                    if options.original and (
                        elem.get("class", "").startswith("entry-date")
                        or elem.get("class", "").startswith("entry-time")
                    ):
                        shortcut_flag = True
                        LOGGER.debug(
                            "shortcut for time/datetime found: %s", elem.get("datetime")
                        )
                    # updated time
                    elif not options.original and elem.get("class") == "updated":
                        shortcut_flag = True
                        LOGGER.debug(
                            "shortcut for updated time/datetime found: %s",
                            elem.get("datetime"),
                        )
                # datetime attribute
                else:
                    LOGGER.debug("time/datetime found: %s", elem.get("datetime"))
                # analyze attribute
                if shortcut_flag:
                    attempt = try_date_expr(
                        elem.get("datetime"),
                        options.format,
                        options.extensive,
                        options.min,
                        options.max,
                    )
                    if attempt is not None:
                        return attempt
                else:
                    reference = compare_reference(
                        reference, elem.get("datetime"), options
                    )
            # bare text in element
            elif elem.text is not None and len(elem.text) > 6:
                LOGGER.debug("time/datetime found in text: %s", elem.text)
                reference = compare_reference(reference, elem.text, options)
            # else...?
        # return
        result = check_extracted_reference(reference, options)
    return result


def normalize_match(match: Optional[Match[str]]) -> str:
    """Normalize string output by adding "0" if necessary,
    and optionally expand the year from two to four digits.

    The non-empty groups of the match are read in the order day, month,
    year. Two-digit years starting with "9" are expanded to 19xx, all
    others to 20xx. The result has the form year-month-day.
    """
    day, month, year = (g.zfill(2) for g in match.groups() if g)  # type: ignore[union-attr]
    if len(year) == 2:
        year = f"19{year}" if year[0] == "9" else f"20{year}"
    return f"{year}-{month}-{day}"
def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
    """
    Opportunistically search the HTML text for common text patterns

    The patterns are tried from the most to the least specific: three
    components, two components, a text regex on the whole document, the
    copyright year and finally a single year.

    :param htmlstring:
        The HTML document in string format, potentially cleaned and stripped to
        the core (much faster)
    :type htmlstring: string
    :param options:
        Define extraction options
    :type options: Extractor
    :return: Returns a valid date expression as a string, or None

    """

    # copyright symbol
    LOGGER.debug("looking for copyright/footer information")
    # 0 stands for "no copyright year found"
    copyear = 0
    bestmatch = search_pattern(
        htmlstring,
        COPYRIGHT_PATTERN,
        YEAR_PATTERN,
        YEAR_PATTERN,
        options,
    )
    if bestmatch is not None:
        LOGGER.debug("Copyright detected: %s", bestmatch[0])
        dateobject = datetime(int(bestmatch[0]), 1, 1)
        if is_valid_date(bestmatch[0], "%Y", earliest=options.min, latest=options.max):
            LOGGER.debug("copyright year/footer pattern found: %s", bestmatch[0])
            copyear = dateobject.year

    # 3 components
    LOGGER.debug("3 components")
    # target URL characteristics
    # then more loosely structured data
    for patterns in THREE_COMP_PATTERNS:
        bestmatch = search_pattern(
            htmlstring,
            patterns[0],
            patterns[1],
            YEAR_PATTERN,
            options,
        )
        result = filter_ymd_candidate(
            bestmatch,
            patterns[0],
            options.original,
            copyear,
            options.format,
            options.min,
            options.max,
        )
        if result is not None:
            return result

    # YYYY-MM-DD/DD-MM-YYYY
    candidates = plausible_year_filter(
        htmlstring,
        pattern=SELECT_YMD_PATTERN,
        yearpat=SELECT_YMD_YEAR,
        earliest=options.min,
        latest=options.max,
    )
    # revert DD-MM-YYYY patterns before sorting
    # different spellings of the same date are merged, their counts add up
    replacement: Counter_Type[str] = Counter()
    for item, count in candidates.items():
        match = THREE_COMP_REGEX_A.match(item)
        candidate = normalize_match(match)
        replacement[candidate] += count
    candidates = replacement
    # select
    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
    result = filter_ymd_candidate(
        bestmatch,
        SELECT_YMD_PATTERN,
        options.original,
        copyear,
        options.format,
        options.min,
        options.max,
    )
    if result is not None:
        return result

    # valid dates strings
    bestmatch = search_pattern(
        htmlstring,
        DATESTRINGS_PATTERN,
        DATESTRINGS_CATCH,
        YEAR_PATTERN,
        options,
    )
    result = filter_ymd_candidate(
        bestmatch,
        DATESTRINGS_PATTERN,
        options.original,
        copyear,
        options.format,
        options.min,
        options.max,
    )
    if result is not None:
        return result

    # DD?/MM?/YY
    candidates = plausible_year_filter(
        htmlstring,
        pattern=SLASHES_PATTERN,
        yearpat=SLASHES_YEAR,
        earliest=options.min,
        latest=options.max,
        incomplete=True,
    )
    # revert DD-MM-YYYY patterns before sorting
    replacement = Counter()
    for item, count in candidates.items():
        match = THREE_COMP_REGEX_B.match(item)
        candidate = normalize_match(match)
        replacement[candidate] += count
    candidates = replacement
    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
    result = filter_ymd_candidate(
        bestmatch,
        SLASHES_PATTERN,
        options.original,
        copyear,
        options.format,
        options.min,
        options.max,
    )
    if result is not None:
        return result

    # 2 components
    LOGGER.debug("switching to two components")
    # first option
    bestmatch = search_pattern(
        htmlstring,
        YYYYMM_PATTERN,
        YYYYMM_CATCH,
        YEAR_PATTERN,
        options,
    )
    if bestmatch is not None:
        dateobject = datetime(int(bestmatch[1]), int(bestmatch[2]), 1)
        if is_valid_date(
            dateobject, "%Y-%m-%d", earliest=options.min, latest=options.max
        ) and (copyear == 0 or dateobject.year >= copyear):
            LOGGER.debug(
                'date found for pattern "%s": %s, %s',
                YYYYMM_PATTERN,
                bestmatch[1],
                bestmatch[2],
            )
            return dateobject.strftime(options.format)

    # 2 components, second option
    candidates = plausible_year_filter(
        htmlstring,
        pattern=MMYYYY_PATTERN,
        yearpat=MMYYYY_YEAR,
        earliest=options.min,
        latest=options.max,
        incomplete=options.original,
    )
    # revert DD-MM-YYYY patterns before sorting
    replacement = Counter()
    for item, count in candidates.items():
        match = TWO_COMP_REGEX.match(item)
        month = match[1]  # type: ignore[index]
        if len(month) == 1:
            month = f"0{month}"
        # no day available: default to the first day of the month
        candidate = "-".join([match[2], month, "01"])  # type: ignore[index]
        replacement[candidate] += count
    candidates = replacement
    # select
    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
    result = filter_ymd_candidate(
        bestmatch,
        MMYYYY_PATTERN,
        options.original,
        copyear,
        options.format,
        options.min,
        options.max,
    )
    if result is not None:
        return result

    # try full-blown text regex on all HTML?
    dateobject = regex_parse(htmlstring)  # type: ignore[assignment]
    # todo: find all candidates and disambiguate?
    if is_valid_date(
        dateobject, options.format, earliest=options.min, latest=options.max
    ) and (copyear == 0 or dateobject.year >= copyear):
        try:
            LOGGER.debug("regex result on HTML: %s", dateobject)
            return dateobject.strftime(options.format)
        except ValueError as err:
            LOGGER.error("value error during conversion: %s %s", dateobject, err)

    # catchall: copyright mention
    if copyear != 0:
        LOGGER.debug("using copyright year as default")
        return convert_date(
            "-".join([str(copyear), "01", "01"]), "%Y-%m-%d", options.format
        )

    # last resort: 1 component
    LOGGER.debug("switching to one component")
    bestmatch = search_pattern(
        htmlstring,
        SIMPLE_PATTERN,
        YEAR_PATTERN,
        YEAR_PATTERN,
        options,
    )
    if bestmatch is not None:
        dateobject = datetime(int(bestmatch[0]), 1, 1)
        if (
            is_valid_date(
                dateobject, "%Y-%m-%d", earliest=options.min, latest=options.max
            )
            and dateobject.year >= copyear
        ):
            LOGGER.debug(
                'date found for pattern "%s": %s', SIMPLE_PATTERN, bestmatch[0]
            )
            return dateobject.strftime(options.format)

    return None
def find_date(
    htmlobject: Union[bytes, str, HtmlElement],
    extensive_search: bool = True,
    original_date: bool = False,
    outputformat: str = "%Y-%m-%d",
    url: Optional[str] = None,
    verbose: bool = False,
    min_date: Optional[Union[datetime, str]] = None,
    max_date: Optional[Union[datetime, str]] = None,
    deferred_url_extractor: bool = False,
) -> Optional[str]:
    """
    Extract dates from HTML documents using markup analysis and text patterns

    :param htmlobject:
        Two possibilities: 1. HTML document (e.g. body of HTTP request or .html-file) in text string
        form or LXML parsed tree or 2. URL string (gets detected automatically)
    :type htmlobject: string or lxml tree
    :param extensive_search:
        Activate pattern-based opportunistic text search
    :type extensive_search: boolean
    :param original_date:
        Look for original date (e.g. publication date) instead of most recent
        one (e.g. last modified, updated time)
    :type original_date: boolean
    :param outputformat:
        Provide a valid datetime format for the returned string
        (see datetime.strftime())
    :type outputformat: string
    :param url:
        Provide an URL manually for pattern-searching in URL
        (in some cases much faster)
    :type url: string
    :param verbose:
        Set verbosity level for debugging
    :type verbose: boolean
    :param min_date:
        Set the earliest acceptable date manually (ISO 8601 YMD format)
    :type min_date: datetime, string
    :param max_date:
        Set the latest acceptable date manually (ISO 8601 YMD format)
    :type max_date: datetime, string
    :param deferred_url_extractor:
        Use url extractor as backup only to prioritize full expressions,
        e.g. of the type `%Y-%m-%d %H:%M:%S`
    :type deferred_url_extractor: boolean
    :return: Returns a valid date expression as a string, or None

    """

    # init
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    tree = load_html(htmlobject)

    # safeguards
    if tree is None:
        return None
    if outputformat != "%Y-%m-%d" and not is_valid_format(outputformat):
        return None

    # define options and time boundaries
    options = Extractor(
        extensive_search,
        get_max_date(max_date),
        get_min_date(min_date),
        original_date,
        outputformat,
    )
    # unclear what this line is for and it impedes type checking:
    # find_date.extensive_search = extensive_search

    # URL
    if url is None:
        # probe for canonical links
        urlelem = tree.find('.//link[@rel="canonical"]')
        if urlelem is not None:
            url = urlelem.get("href")

    # direct processing of URL info
    url_result = extract_url_date(url, options)
    if url_result is not None and not deferred_url_extractor:
        return url_result

    # first try header
    # then try to use JSON data
    result = examine_header(tree, options) or json_search(tree, options)
    if result is not None:
        return result

    # deferred processing of URL info (may be moved even further down if necessary)
    if deferred_url_extractor and url_result is not None:
        return url_result

    # try abbr elements
    abbr_result = examine_abbr_elements(
        tree,
        options,
    )
    if abbr_result is not None:
        return abbr_result

    # first, prune tree (on a copy, the tree passed by the caller stays intact)
    try:
        search_tree, _discarded = discard_unwanted(
            clean_html(deepcopy(tree), CLEANING_LIST)
        )
    # rare LXML error: no NULL bytes or control characters
    except ValueError:  # pragma: no cover
        search_tree = tree
        LOGGER.error("lxml cleaner error")

    # define expressions + text_content
    if extensive_search:
        date_expr = SLOW_PREPEND + DATE_EXPRESSIONS
    else:
        date_expr = FAST_PREPEND + DATE_EXPRESSIONS

    # then look for expressions
    # and try time elements
    result = (
        examine_date_elements(
            search_tree,
            date_expr,
            options,
        )
        or examine_date_elements(
            search_tree,
            ".//title|.//h1",
            options,
        )
        or examine_time_elements(search_tree, options)
    )
    if result is not None:
        return result

    # TODO: decide whether the discarded parts of the tree should be
    # searched as well (examine_date_elements with DATE_EXPRESSIONS)

    # robust conversion to string
    try:
        htmlstring = tostring(search_tree, pretty_print=False, encoding="unicode")
    except UnicodeDecodeError:
        htmlstring = tostring(search_tree, pretty_print=False).decode("utf-8", "ignore")

    # date regex timestamp rescue
    # try image elements
    # precise patterns and idiosyncrasies
    result = (
        pattern_search(htmlstring, TIMESTAMP_PATTERN, options)
        or img_search(search_tree, options)
        or idiosyncrasies_search(htmlstring, options)
    )
    if result is not None:
        return result

    # last resort
    if extensive_search:
        LOGGER.debug("extensive search started")
        # TODO: further tests & decide according to original_date
        reference = 0
        for segment in FREE_TEXT_EXPRESSIONS(search_tree):
            segment = segment.strip()
            if not MIN_SEGMENT_LEN < len(segment) < MAX_SEGMENT_LEN:
                continue
            reference = compare_reference(reference, segment, options)
        converted = check_extracted_reference(reference, options)
        # return or search page HTML
        return converted or search_page(htmlstring, options)

    return None