Source code for htmldate.validators

# pylint:disable-msg=E0611,I1101
"""
Filters for date parsing and date validators.
"""

import logging

from collections import Counter
from datetime import datetime
from functools import lru_cache
from time import mktime
from typing import Match, Optional, Pattern, Union, Counter as Counter_Type

from .settings import CACHE_SIZE, MIN_DATE
from .utils import Extractor


LOGGER = logging.getLogger(__name__)
LOGGER.debug("minimum date setting: %s", MIN_DATE)


@lru_cache(maxsize=CACHE_SIZE)
def is_valid_date(
    date_input: Optional[Union[datetime, str]],
    outputformat: str,
    earliest: datetime,
    latest: datetime,
) -> bool:
    """Validate a string w.r.t. the chosen outputformat and basic heuristics"""
    # safety check
    if date_input is None:
        return False

    # try if date can be parsed using chosen outputformat
    if isinstance(date_input, datetime):
        dateobject = date_input
    else:
        # speed-up
        try:
            if outputformat == "%Y-%m-%d":
                dateobject = datetime(
                    int(date_input[:4]), int(date_input[5:7]), int(date_input[8:10])
                )
            # default
            else:
                dateobject = datetime.strptime(date_input, outputformat)
        except ValueError:
            return False

    # year first, then full validation: not newer than today or stored variable
    if (
        earliest.year <= dateobject.year <= latest.year
        and earliest.timestamp() <= dateobject.timestamp() <= latest.timestamp()
    ):
        return True
    LOGGER.debug("date not valid: %s", date_input)
    return False


@lru_cache(maxsize=16)
def is_valid_format(outputformat: str) -> bool:
    """Validate the output format in the settings"""
    # test with date object
    dateobject = datetime(2017, 9, 1, 0, 0)
    try:
        dateobject.strftime(outputformat)
    # other than ValueError: Python < 3.7 only
    except (NameError, TypeError, ValueError) as err:
        LOGGER.error("wrong output format or type: %s %s", outputformat, err)
        return False
    # test in abstracto
    if not isinstance(outputformat, str) or "%" not in outputformat:
        LOGGER.error("malformed output format: %s", outputformat)
        return False
    return True


def plausible_year_filter(
    htmlstring: str,
    *,
    pattern: Pattern[str],
    yearpat: Pattern[str],
    earliest: datetime,
    latest: datetime,
    incomplete: bool = False,
) -> Counter_Type[str]:
    """Filter the date patterns to find plausible years only"""
    occurrences = Counter(pattern.findall(htmlstring))  # slow!

    for item in list(occurrences):
        year_match = yearpat.search(item)
        if year_match is None:
            LOGGER.debug("not a year pattern: %s", item)
            del occurrences[item]
            continue

        lastdigits = year_match[1]
        if not incomplete:
            potential_year = int(lastdigits)
        else:
            century = "19" if lastdigits[0] == "9" else "20"
            potential_year = int(century + lastdigits)

        if not earliest.year <= potential_year <= latest.year:
            LOGGER.debug("no potential year: %s", item)
            del occurrences[item]

    return occurrences


def compare_values(reference: int, attempt: str, options: Extractor) -> int:
    """Compare the date expression to a reference"""
    try:
        timestamp = int(mktime(datetime.strptime(attempt, options.format).timetuple()))
    except Exception as err:
        LOGGER.debug("datetime.strptime exception: %s for string %s", err, attempt)
        return reference
    if options.original and (reference == 0 or timestamp < reference):
        reference = timestamp
    elif not options.original and timestamp > reference:
        reference = timestamp
    return reference


@lru_cache(maxsize=CACHE_SIZE)
def filter_ymd_candidate(
    bestmatch: Match[str],
    pattern: Pattern[str],
    original_date: bool,
    copyear: int,
    outputformat: str,
    min_date: datetime,
    max_date: datetime,
) -> Optional[str]:
    """Filter free text candidates in the YMD format"""
    if bestmatch is not None:
        pagedate = "-".join([bestmatch[1], bestmatch[2], bestmatch[3]])
        if is_valid_date(pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date) and (
            copyear == 0 or int(bestmatch[1]) >= copyear
        ):
            LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate)
            return convert_date(pagedate, "%Y-%m-%d", outputformat)
            ## TODO: test and improve
            # if original_date is True:
            #    if copyear == 0 or int(bestmatch[1]) <= copyear:
            #        LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate)
            #        return convert_date(pagedate, '%Y-%m-%d', outputformat)
            # else:
            #    if copyear == 0 or int(bestmatch[1]) >= copyear:
            #        LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate)
            #        return convert_date(pagedate, '%Y-%m-%d', outputformat)
    return None


[docs] def convert_date(datestring: str, inputformat: str, outputformat: str) -> str: """Parse date and return string in desired format""" # speed-up (%Y-%m-%d) if inputformat == outputformat: return datestring # date object (speedup) if isinstance(datestring, datetime): return datestring.strftime(outputformat) # normal dateobject = datetime.strptime(datestring, inputformat) return dateobject.strftime(outputformat)
def check_extracted_reference(reference: int, options: Extractor) -> Optional[str]: """Test if the extracted reference date can be returned""" if reference > 0: dateobject = datetime.fromtimestamp(reference) converted = dateobject.strftime(options.format) if is_valid_date( converted, options.format, earliest=options.min, latest=options.max ): return converted return None def check_date_input( date_object: Optional[Union[datetime, str]], default: datetime ) -> datetime: "Check if the input is a usable datetime or ISO date string, return default otherwise" if isinstance(date_object, datetime): return date_object if isinstance(date_object, str): try: return datetime.fromisoformat(date_object) # type: ignore[attr-defined] except ValueError: LOGGER.warning("invalid datetime string: %s", date_object) return default # no input or error thrown def get_min_date(min_date: Optional[Union[datetime, str]]) -> datetime: """Validates the minimum date and/or defaults to earliest plausible date""" return check_date_input(min_date, MIN_DATE) def get_max_date(max_date: Optional[Union[datetime, str]]) -> datetime: """Validates the maximum date and/or defaults to latest plausible date""" return check_date_input(max_date, datetime.now())