""" :mod:`pandas.io.html` is a module containing functionality for dealing with HTML IO. """ from __future__ import annotations from collections import abc import numbers import re from typing import ( TYPE_CHECKING, Iterable, Literal, Pattern, Sequence, cast, ) from pandas._typing import ( FilePath, ReadBuffer, ) from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, EmptyDataError, ) from pandas.util._decorators import deprecate_nonkeyword_arguments from pandas.core.dtypes.common import is_list_like from pandas import isna from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.indexes.base import Index from pandas.core.indexes.multi import MultiIndex from pandas.io.common import ( file_exists, get_handle, is_url, stringify_path, urlopen, validate_header_arg, ) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser if TYPE_CHECKING: from pandas import DataFrame _IMPORTS = False _HAS_BS4 = False _HAS_LXML = False _HAS_HTML5LIB = False def _importers() -> None: # import things we need # but make this done on a first use basis global _IMPORTS if _IMPORTS: return global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB bs4 = import_optional_dependency("bs4", errors="ignore") _HAS_BS4 = bs4 is not None lxml = import_optional_dependency("lxml.etree", errors="ignore") _HAS_LXML = lxml is not None html5lib = import_optional_dependency("html5lib", errors="ignore") _HAS_HTML5LIB = html5lib is not None _IMPORTS = True ############# # READ HTML # ############# _RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str: """ Replace extra whitespace inside of a string with a single space. Parameters ---------- s : str or unicode The string from which to remove extra whitespace. regex : re.Pattern The regular expression to use to remove extra whitespace. 
def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Normalize a ``skiprows`` argument into an integer or a sequence of rows.

    Parameters
    ----------
    skiprows : int, slice, list-like or None
        The rows to skip.  A slice is expanded into an explicit list of row
        positions; a missing ``start`` is treated as 0 and a missing ``step``
        as 1.  ``None`` means that no rows are skipped.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, list-like or None
        * If `skiprows` is a slice without a ``stop``, because an open-ended
          slice cannot be expanded into a finite list of rows

    Returns
    -------
    it : int or sequence of int
        0 for ``None``, a list of row positions for a slice, otherwise
        `skiprows` itself.
    """
    if skiprows is None:
        return 0

    if isinstance(skiprows, slice):
        if skiprows.stop is None:
            # range() would fail here with an unhelpful "'NoneType' object
            # cannot be interpreted as an integer"; report the cause instead.
            raise TypeError(
                "slice 'stop' must be set to use a slice for skipping rows; "
                f"got {skiprows!r}"
            )
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))

    if isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)

    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")


def _read(
    obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, bytes, path object, or file-like object
        A URL, an object with a ``read`` attribute, or a string naming an
        existing file is opened and read.  Any other ``str`` or ``bytes``
        value is treated as the content itself and returned unchanged.
    encoding : str or None
        Encoding passed to ``get_handle`` when `obj` has to be opened.

    Raises
    ------
    TypeError
        If `obj` is none of the supported kinds of input.

    Returns
    -------
    raw_text : str or bytes
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
        # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
        # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
        # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
        with get_handle(
            obj, "r", encoding=encoding  # type: ignore[arg-type]
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        # Not a URL, buffer or existing path: the value is the markup itself.
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
Parameters ---------- io : str or file-like This can be either a string of raw HTML, a valid URL using the HTTP, FTP, or FILE protocols or a file-like object. match : str or regex The text to match in the document. attrs : dict List of HTML