from __future__ import annotations

from io import (
    BytesIO,
    StringIO,
)
from lzma import LZMAError
import os
from tarfile import ReadError
from urllib.error import HTTPError
from zipfile import BadZipFile

import numpy as np
import pytest

from pandas.compat import is_ci_environment
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    EmptyDataError,
    ParserError,
)
import pandas.util._test_decorators as td

from pandas import DataFrame
import pandas._testing as tm

from pandas.io.common import get_handle
from pandas.io.xml import read_xml

"""
CHECK LIST

[x] - ValueError: "Values for parser can only be lxml or etree."

etree
[X] - ImportError: "lxml not found, please install or use the etree parser."
[X] - TypeError: "expected str, bytes or os.PathLike object, not NoneType"
[X] - ValueError: "Either element or attributes can be parsed not both."
[X] - ValueError: "xpath does not return any nodes..."
[X] - SyntaxError: "You have used an incorrect or unsupported XPath"
[X] - ValueError: "names does not match length of child elements in xpath."
[X] - TypeError: "...is not a valid type for names"
[X] - ValueError: "To use stylesheet, you need lxml installed..."
[] - URLError: (GENERAL ERROR WITH HTTPError AS SUBCLASS)
[X] - HTTPError: "HTTP Error 404: Not Found"
[] - OSError: (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS)
[X] - FileNotFoundError: "No such file or directory"
[] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
[X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
[X] - UnicodeError: "UTF-16 stream does not start with BOM"
[X] - BadZipFile: "File is not a zip file"
[X] - OSError: "Invalid data stream"
[X] - LZMAError: "Input format not supported by decoder"
[X] - ValueError: "Unrecognized compression type"
[X] - PermissionError: "Forbidden"

lxml
[X] - ValueError: "Either element or attributes can be parsed not both."
[X] - AttributeError: "__enter__"
[X] - XSLTApplyError: "Cannot resolve URI"
[X] - XSLTParseError: "document is not a stylesheet"
[X] - ValueError: "xpath does not return any nodes."
[X] - XPathEvalError: "Invalid expression"
[] - XPathSyntaxError: (OLD VERSION IN lxml FOR XPATH ERRORS)
[X] - TypeError: "empty namespace prefix is not supported in XPath"
[X] - ValueError: "names does not match length of child elements in xpath."
[X] - TypeError: "...is not a valid type for names"
[X] - LookupError: "unknown encoding"
[] - URLError: (USUALLY DUE TO NETWORKING)
[X] - HTTPError: "HTTP Error 404: Not Found"
[X] - OSError: "failed to load external entity"
[X] - XMLSyntaxError: "Start tag expected, '<' not found"
[] - ParserError: (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
[X] - ValueError: "Values for parser can only be lxml or etree."
[X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
[X] - UnicodeError: "UTF-16 stream does not start with BOM"
[X] - BadZipFile: "File is not a zip file"
[X] - OSError: "Invalid data stream"
[X] - LZMAError: "Input format not supported by decoder"
[X] - ValueError: "Unrecognized compression type"
[X] - PermissionError: "Forbidden"
"""

geom_df = DataFrame(
    {
        "shape": ["square", "circle", "triangle"],
        "degrees": [360, 360, 180],
        "sides": [4, np.nan, 3],
    }
)

xml_default_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
  <row>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4</sides>
  </row>
  <row>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3</sides>
  </row>
</data>"""

xml_prefix_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<doc:data xmlns:doc="http://example.com">
  <doc:row>
    <doc:shape>square</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides>4.0</doc:sides>
  </doc:row>
  <doc:row>
    <doc:shape>circle</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides/>
  </doc:row>
  <doc:row>
    <doc:shape>triangle</doc:shape>
    <doc:degrees>180</doc:degrees>
    <doc:sides>3.0</doc:sides>
  </doc:row>
</doc:data>"""


df_kml = DataFrame(
    {
        "id": {
            0: "ID_00001",
            1: "ID_00002",
            2: "ID_00003",
            3: "ID_00004",
            4: "ID_00005",
        },
        "name": {
            0: "Blue Line (Forest Park)",
            1: "Red, Purple Line",
            2: "Red, Purple Line",
            3: "Red, Purple Line",
            4: "Red, Purple Line",
        },
        "styleUrl": {
            0: "#LineStyle01",
            1: "#LineStyle01",
            2: "#LineStyle01",
            3: "#LineStyle01",
            4: "#LineStyle01",
        },
        "extrude": {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
        "altitudeMode": {
            0: "clampedToGround",
            1: "clampedToGround",
            2: "clampedToGround",
            3: "clampedToGround",
            4: "clampedToGround",
        },
        "coordinates": {
            0: (
                "-87.77678526964958,41.8708863930319,0 "
                "-87.77826234150609,41.87097820122218,0 "
                "-87.78251583439344,41.87130129991005,0 "
                "-87.78418294588424,41.87145055520308,0 "
                "-87.7872369165933,41.8717239119163,0 "
                "-87.79160214925886,41.87210797280065,0"
            ),
            1: (
                "-87.65758750947528,41.96427269188822,0 "
                "-87.65802133507393,41.96581929055245,0 "
                "-87.65819033925305,41.96621846093642,0 "
                "-87.6583189819129,41.96650362897086,0 "
                "-87.65835858701473,41.96669002089185,0 "
                "-87.65838428411853,41.96688150295095,0 "
                "-87.65842208882658,41.96745896091846,0 "
                "-87.65846556843937,41.9683761425439,0 "
                "-87.65849296214573,41.96913893870342,0"
            ),
            2: (
                "-87.65492939166126,41.95377494531437,0 "
                "-87.65557043199591,41.95376544118533,0 "
                "-87.65606302030132,41.95376391658746,0 "
                "-87.65623502146268,41.95377379126367,0 "
                "-87.65634748981634,41.95380103566435,0 "
                "-87.65646537904269,41.95387703994676,0 "
                "-87.65656532461145,41.95396622645799,0 "
                "-87.65664760856414,41.95404201996044,0 "
                "-87.65671750555913,41.95416647054043,0 "
                "-87.65673983607117,41.95429949810849,0 "
                "-87.65673866475777,41.95441024240925,0 "
                "-87.6567690255541,41.95490657227902,0 "
                "-87.65683672482363,41.95692259283837,0 "
                "-87.6568900886376,41.95861070983142,0 "
                "-87.65699865558875,41.96181418669004,0 "
                "-87.65756347177603,41.96397045777844,0 "
                "-87.65758750947528,41.96427269188822,0"
            ),
            3: (
                "-87.65362593118043,41.94742799535678,0 "
                "-87.65363554415794,41.94819886386848,0 "
                "-87.6536456393239,41.95059994675451,0 "
                "-87.65365831235026,41.95108288489359,0 "
                "-87.6536604873874,41.9519954657554,0 "
                "-87.65362592053201,41.95245597302328,0 "
                "-87.65367158496069,41.95311153649393,0 "
                "-87.65368468595476,41.9533202828916,0 "
                "-87.65369271253692,41.95343095587119,0 "
                "-87.65373335834569,41.95351536301472,0 "
                "-87.65378605844126,41.95358212680591,0 "
                "-87.65385067928185,41.95364452823767,0 "
                "-87.6539390793817,41.95370263886964,0 "
                "-87.6540786298351,41.95373403675265,0 "
                "-87.65430648647626,41.9537535411832,0 "
                "-87.65492939166126,41.95377494531437,0"
            ),
            4: (
                "-87.65345391792157,41.94217681262115,0 "
                "-87.65342448305786,41.94237224420864,0 "
                "-87.65339745703922,41.94268217746244,0 "
                "-87.65337753982941,41.94288140770284,0 "
                "-87.65336256753105,41.94317369618263,0 "
                "-87.65338799707138,41.94357253961736,0 "
                "-87.65340240886648,41.94389158188269,0 "
                "-87.65341837392448,41.94406444407721,0 "
                "-87.65342275247338,41.94421065714904,0 "
                "-87.65347469646018,41.94434829382345,0 "
"-87.65351486483024,41.94447699917548,0 " "-87.65353483605053,41.9453896864472,0 " "-87.65361975532807,41.94689193720703,0 " "-87.65362593118043,41.94742799535678,0" ), }, } ) @pytest.fixture(params=["rb", "r"]) def mode(request): return request.param @pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) def parser(request): return request.param def read_xml_iterparse(data, **kwargs): with tm.ensure_clean() as path: with open(path, "w") as f: f.write(data) return read_xml(path, **kwargs) def read_xml_iterparse_comp(comp_path, compression_only, **kwargs): with get_handle(comp_path, "r", compression=compression_only) as handles: with tm.ensure_clean() as path: with open(path, "w") as f: f.write(handles.handle.read()) return read_xml(path, **kwargs) # FILE / URL @td.skip_if_no("lxml") def test_parser_consistency_file(datapath): filename = datapath("io", "data", "xml", "books.xml") df_file_lxml = read_xml(filename, parser="lxml") df_file_etree = read_xml(filename, parser="etree") df_iter_lxml = read_xml( filename, parser="lxml", iterparse={"book": ["category", "title", "year", "author", "price"]}, ) df_iter_etree = read_xml( filename, parser="etree", iterparse={"book": ["category", "title", "year", "author", "price"]}, ) tm.assert_frame_equal(df_file_lxml, df_file_etree) tm.assert_frame_equal(df_file_lxml, df_iter_lxml) tm.assert_frame_equal(df_iter_lxml, df_iter_etree) @pytest.mark.network @pytest.mark.slow @tm.network( url=( "https://data.cityofchicago.org/api/views/" "8pix-ypme/rows.xml?accessType=DOWNLOAD" ), check_before_test=True, ) def test_parser_consistency_url(parser): url = ( "https://data.cityofchicago.org/api/views/" "8pix-ypme/rows.xml?accessType=DOWNLOAD" ) with tm.ensure_clean(filename="cta.xml") as path: (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False)) df_xpath = read_xml(path, parser=parser) df_iter = read_xml( path, parser=parser, iterparse={ "row": [ "_id", "_uuid", "_position", "_address", "stop_id", "direction_id", "stop_name", "station_name", "station_descriptive_name", "map_id", "ada", "red", "blue", "g", "brn", "p", "pexp", "y", "pnk", "o", "location", ] }, ) tm.assert_frame_equal(df_xpath, df_iter) def test_file_like(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: df_file = read_xml(f, parser=parser) df_expected = DataFrame( { "category": ["cooking", "children", "web"], "title": ["Everyday Italian", "Harry Potter", "Learning XML"], "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], } ) tm.assert_frame_equal(df_file, df_expected) def test_file_io(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: xml_obj = f.read() df_io = read_xml( (BytesIO(xml_obj) if isinstance(xml_obj, bytes) else StringIO(xml_obj)), parser=parser, ) df_expected = DataFrame( { "category": ["cooking", "children", "web"], "title": ["Everyday Italian", "Harry Potter", "Learning XML"], "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], } ) tm.assert_frame_equal(df_io, df_expected) def test_file_buffered_reader_string(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: xml_obj = f.read() df_str = read_xml(xml_obj, parser=parser) df_expected = DataFrame( { "category": ["cooking", "children", "web"], "title": ["Everyday Italian", "Harry Potter", "Learning XML"], "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], } ) tm.assert_frame_equal(df_str, df_expected) def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") with open(filename, mode) as f: next(f) xml_obj = f.read() df_str = read_xml(xml_obj, parser=parser) df_expected = DataFrame( { "category": ["cooking", "children", "web"], "title": ["Everyday Italian", "Harry Potter", "Learning XML"], "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], } ) tm.assert_frame_equal(df_str, df_expected) def test_string_charset(parser): txt = "<中文標籤>12" df_str = read_xml(txt, parser=parser) df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0]) tm.assert_frame_equal(df_str, df_expected) def test_file_charset(datapath, parser): xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml") df_file = read_xml(datapath(xml_file), parser=parser) df_expected = DataFrame( { "問": [ "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正", "問 既破有得申無得 亦應但破性執申假名以不", "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶", ], "答": [ "答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故", None, "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破", ], "a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None], } ) tm.assert_frame_equal(df_file, df_expected) def test_file_handle_close(datapath, parser): xml_file = datapath("io", "data", "xml", "books.xml") with open(xml_file, "rb") as f: read_xml(BytesIO(f.read()), parser=parser) assert not f.closed @td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_lxml(val): from lxml.etree import XMLSyntaxError msg = "|".join( [ "Document is empty", # Seen on Mac with lxml 4.91 r"None \(line 0\)", ] ) with pytest.raises(XMLSyntaxError, match=msg): read_xml(val, parser="lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_etree(val): from xml.etree.ElementTree import ParseError with pytest.raises(ParseError, match="no element found"): read_xml(val, parser="etree") @td.skip_if_no("lxml") def test_wrong_file_path_lxml(): from lxml.etree import XMLSyntaxError filename = os.path.join("data", "html", "books.xml") with pytest.raises( XMLSyntaxError, match=("Start tag expected, '<' not found"), ): read_xml(filename, parser="lxml") def test_wrong_file_path_etree(): from xml.etree.ElementTree import ParseError filename = os.path.join("data", "html", "books.xml") with pytest.raises( ParseError, match=("not well-formed"), ): read_xml(filename, parser="etree") @pytest.mark.network @tm.network( url="https://www.w3schools.com/xml/books.xml", check_before_test=True, ) @td.skip_if_no("lxml") def test_url(): url = "https://www.w3schools.com/xml/books.xml" df_url = read_xml(url, xpath=".//book[count(*)=4]") df_expected = DataFrame( { "category": ["cooking", "children", "web"], "title": ["Everyday Italian", "Harry Potter", "Learning XML"], "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], "cover": [None, None, "paperback"], } ) tm.assert_frame_equal(df_url, df_expected) @pytest.mark.network @tm.network(url="https://www.w3schools.com/xml/python.xml", check_before_test=True) def test_wrong_url(parser): with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): url = "https://www.w3schools.com/xml/python.xml" read_xml(url, xpath=".//book[count(*)=4]", parser=parser) # XPATH @td.skip_if_no("lxml") def test_empty_xpath_lxml(datapath): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(ValueError, match=("xpath does not return any nodes")): read_xml(filename, xpath=".//python", parser="lxml") def test_bad_xpath_etree(datapath): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises( SyntaxError, match=("You have used an incorrect or unsupported XPath") ): read_xml(filename, xpath=".//[book]", parser="etree") @td.skip_if_no("lxml") def test_bad_xpath_lxml(datapath): from lxml.etree import XPathEvalError filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(XPathEvalError, match=("Invalid expression")): read_xml(filename, xpath=".//[book]", parser="lxml") # NAMESPACE def test_default_namespace(parser): df_nmsp = read_xml( xml_default_nmsp, xpath=".//ns:row", namespaces={"ns": "http://example.com"}, parser=parser, ) df_iter = read_xml_iterparse( xml_default_nmsp, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}, ) df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], "degrees": [360, 360, 180], "sides": [4.0, float("nan"), 3.0], } ) tm.assert_frame_equal(df_nmsp, df_expected) tm.assert_frame_equal(df_iter, df_expected) def test_prefix_namespace(parser): df_nmsp = read_xml( xml_prefix_nmsp, xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser=parser, ) df_iter = read_xml_iterparse( xml_prefix_nmsp, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} ) df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], "degrees": [360, 360, 180], "sides": [4.0, float("nan"), 3.0], } ) tm.assert_frame_equal(df_nmsp, df_expected) tm.assert_frame_equal(df_iter, df_expected) @td.skip_if_no("lxml") def test_consistency_default_namespace(): df_lxml = read_xml( xml_default_nmsp, xpath=".//ns:row", namespaces={"ns": "http://example.com"}, parser="lxml", ) df_etree = read_xml( xml_default_nmsp, xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="etree", ) tm.assert_frame_equal(df_lxml, df_etree) @td.skip_if_no("lxml") def test_consistency_prefix_namespace(): df_lxml = read_xml( xml_prefix_nmsp, xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="lxml", ) df_etree = read_xml( xml_prefix_nmsp, xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="etree", ) tm.assert_frame_equal(df_lxml, df_etree) # PREFIX def test_missing_prefix_with_default_namespace(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(ValueError, match=("xpath does not return any nodes")): read_xml(filename, xpath=".//Placemark", parser=parser) def test_missing_prefix_definition_etree(datapath): filename = datapath("io", "data", "xml", "cta_rail_lines.kml") with pytest.raises(SyntaxError, match=("you used an undeclared namespace prefix")): read_xml(filename, xpath=".//kml:Placemark", parser="etree") @td.skip_if_no("lxml") def test_missing_prefix_definition_lxml(datapath): from lxml.etree import XPathEvalError filename = datapath("io", "data", 
"xml", "cta_rail_lines.kml") with pytest.raises(XPathEvalError, match=("Undefined namespace prefix")): read_xml(filename, xpath=".//kml:Placemark", parser="lxml") @td.skip_if_no("lxml") @pytest.mark.parametrize("key", ["", None]) def test_none_namespace_prefix(key): with pytest.raises( TypeError, match=("empty namespace prefix is not supported in XPath") ): read_xml( xml_default_nmsp, xpath=".//kml:Placemark", namespaces={key: "http://www.opengis.net/kml/2.2"}, parser="lxml", ) # ELEMS AND ATTRS def test_file_elems_and_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) df_iter = read_xml( filename, parser=parser, iterparse={"book": ["category", "title", "author", "year", "price"]}, ) df_expected = DataFrame( { "category": ["cooking", "children", "web"], "title": ["Everyday Italian", "Harry Potter", "Learning XML"], "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], } ) tm.assert_frame_equal(df_file, df_expected) tm.assert_frame_equal(df_iter, df_expected) def test_file_only_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, attrs_only=True, parser=parser) df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]}) df_expected = DataFrame({"category": ["cooking", "children", "web"]}) tm.assert_frame_equal(df_file, df_expected) tm.assert_frame_equal(df_iter, df_expected) def test_file_only_elems(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, elems_only=True, parser=parser) df_iter = read_xml( filename, parser=parser, iterparse={"book": ["title", "author", "year", "price"]}, ) df_expected = DataFrame( { "title": ["Everyday Italian", "Harry Potter", "Learning XML"], "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], } ) tm.assert_frame_equal(df_file, df_expected) tm.assert_frame_equal(df_iter, df_expected) def test_elem_and_attrs_only(datapath, parser): filename = datapath("io", "data", "xml", "cta_rail_lines.kml") with pytest.raises( ValueError, match=("Either element or attributes can be parsed not both"), ): read_xml(filename, elems_only=True, attrs_only=True, parser=parser) @td.skip_if_no("lxml") def test_attribute_centric_xml(): xml = """\ """ df_lxml = read_xml(xml, xpath=".//station") df_etree = read_xml(xml, xpath=".//station", parser="etree") df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]}) df_iter_et = read_xml_iterparse( xml, parser="etree", iterparse={"station": ["Name", "coords"]} ) tm.assert_frame_equal(df_lxml, df_etree) tm.assert_frame_equal(df_iter_lx, df_iter_et) # NAMES def test_names_option_output(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml( filename, names=["Col1", "Col2", "Col3", "Col4", "Col5"], parser=parser ) df_iter = read_xml( filename, parser=parser, names=["Col1", "Col2", "Col3", "Col4", "Col5"], iterparse={"book": ["category", "title", "author", "year", "price"]}, ) df_expected = DataFrame( { "Col1": ["cooking", "children", "web"], "Col2": ["Everyday Italian", "Harry Potter", "Learning XML"], "Col3": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], "Col4": [2005, 2005, 2003], "Col5": [30.00, 29.99, 39.95], } ) tm.assert_frame_equal(df_file, df_expected) tm.assert_frame_equal(df_iter, df_expected) def test_repeat_names(parser): xml = """\ circle curved sphere curved """ df_xpath = read_xml( xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"] ) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["type", "name", "type"]}, names=["type_dim", "shape", "type_edge"], ) df_expected = DataFrame( { "type_dim": ["2D", "3D"], "shape": ["circle", "sphere"], "type_edge": ["curved", "curved"], } ) tm.assert_frame_equal(df_xpath, df_expected) tm.assert_frame_equal(df_iter, df_expected) def test_repeat_values_new_names(parser): xml = """\ rectangle rectangle square rectangle ellipse ellipse circle ellipse """ df_xpath = read_xml(xml, xpath=".//shape", parser=parser, names=["name", "group"]) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["name", "family"]}, names=["name", "group"], ) df_expected = DataFrame( { "name": ["rectangle", "square", "ellipse", "circle"], "group": ["rectangle", "rectangle", "ellipse", "ellipse"], } ) tm.assert_frame_equal(df_xpath, df_expected) tm.assert_frame_equal(df_iter, df_expected) def test_names_option_wrong_length(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(ValueError, match=("names does not match length")): read_xml(filename, names=["Col1", "Col2", "Col3"], parser=parser) def test_names_option_wrong_type(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("is not a valid type for names")): read_xml(filename, names="Col1, Col2, Col3", parser=parser) # ENCODING def test_wrong_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode")): read_xml(filename, parser=parser) def test_utf16_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") with pytest.raises( UnicodeError, match=( "UTF-16 stream does not start with BOM|" "'utf-16-le' codec can't decode byte" ), ): read_xml(filename, encoding="UTF-16", parser=parser) def test_unknown_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") with pytest.raises(LookupError, match=("unknown encoding: UFT-8")): read_xml(filename, encoding="UFT-8", parser=parser) def test_ascii_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") with pytest.raises(UnicodeDecodeError, match=("'ascii' codec can't decode byte")): read_xml(filename, encoding="ascii", parser=parser) @td.skip_if_no("lxml") def test_parser_consistency_with_encoding(datapath): filename = datapath("io", "data", "xml", "baby_names.xml") df_xpath_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") df_xpath_etree = read_xml(filename, parser="etree", encoding="iso-8859-1") df_iter_lxml = read_xml( filename, parser="lxml", encoding="ISO-8859-1", iterparse={"row": ["rank", "malename", "femalename"]}, ) df_iter_etree = read_xml( filename, parser="etree", encoding="ISO-8859-1", iterparse={"row": ["rank", "malename", "femalename"]}, ) tm.assert_frame_equal(df_xpath_lxml, df_xpath_etree) tm.assert_frame_equal(df_xpath_etree, df_iter_etree) tm.assert_frame_equal(df_iter_lxml, df_iter_etree) @td.skip_if_no("lxml") def test_wrong_encoding_for_lxml(): # GH#45133 data = """ c """ with pytest.raises(TypeError, match="encoding None"): 
        read_xml(StringIO(data), parser="lxml", encoding=None)


def test_none_encoding_etree():
    # GH#45133
    data = """<data>
  <row>
    <a>c</a>
  </row>
</data>
"""
    result = read_xml(StringIO(data), parser="etree", encoding=None)
    expected = DataFrame({"a": ["c"]})
    tm.assert_frame_equal(result, expected)


# PARSER


@td.skip_if_installed("lxml")
def test_default_parser_no_lxml(datapath):
    filename = datapath("io", "data", "xml", "books.xml")

    with pytest.raises(
        ImportError, match=("lxml not found, please install or use the etree parser.")
    ):
        read_xml(filename)


def test_wrong_parser(datapath):
    filename = datapath("io", "data", "xml", "books.xml")

    with pytest.raises(
        ValueError, match=("Values for parser can only be lxml or etree.")
    ):
        read_xml(filename, parser="bs4")


# STYLESHEET


@td.skip_if_no("lxml")
def test_stylesheet_file(datapath):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")

    df_style = read_xml(
        kml,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl,
    )

    df_iter = read_xml(
        kml,
        iterparse={
            "Placemark": [
                "id",
                "name",
                "styleUrl",
                "extrude",
                "altitudeMode",
                "coordinates",
            ]
        },
    )

    tm.assert_frame_equal(df_kml, df_style)
    tm.assert_frame_equal(df_kml, df_iter)


def test_read_xml_passing_as_positional_deprecated(datapath, parser):
    # GH#45133
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")

    with tm.assert_produces_warning(FutureWarning, match="keyword-only"):
        read_xml(
            kml,
            ".//k:Placemark",
            namespaces={"k": "http://www.opengis.net/kml/2.2"},
            parser=parser,
        )


@td.skip_if_no("lxml")
def test_stylesheet_file_like(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")

    with open(xsl, mode) as f:
        df_style = read_xml(
            kml,
            xpath=".//k:Placemark",
            namespaces={"k": "http://www.opengis.net/kml/2.2"},
            stylesheet=f,
        )

    tm.assert_frame_equal(df_kml, df_style)


@td.skip_if_no("lxml")
def test_stylesheet_io(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")

    xsl_obj: BytesIO | StringIO

    with open(xsl, mode) as f:
        if mode == "rb":
            xsl_obj = BytesIO(f.read())
        else:
            xsl_obj = StringIO(f.read())

    df_style = read_xml(
        kml,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_obj,
    )

    tm.assert_frame_equal(df_kml, df_style)


@td.skip_if_no("lxml")
def test_stylesheet_buffered_reader(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")

    with open(xsl, mode) as f:
        xsl_obj = f.read()

    df_style = read_xml(
        kml,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_obj,
    )

    tm.assert_frame_equal(df_kml, df_style)


@td.skip_if_no("lxml")
def test_style_charset():
    xml = "<中文標籤><c1>1</c1><c2>2</c2></中文標籤>"

    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
 <xsl:output omit-xml-declaration="yes" indent="yes"/>
 <xsl:strip-space elements="*"/>

 <xsl:template match="node()|@*">
     <xsl:copy>
       <xsl:apply-templates select="node()|@*"/>
     </xsl:copy>
 </xsl:template>

 <xsl:template match="中文標籤">
     <根>
       <xsl:apply-templates />
     </根>
 </xsl:template>

</xsl:stylesheet>"""

    df_orig = read_xml(xml)
    df_style = read_xml(xml, stylesheet=xsl)

    tm.assert_frame_equal(df_orig, df_style)


@td.skip_if_no("lxml")
def test_not_stylesheet(datapath):
    from lxml.etree import XSLTParseError

    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "books.xml")

    with pytest.raises(XSLTParseError, match=("document is not a stylesheet")):
        read_xml(kml, stylesheet=xsl)


@td.skip_if_no("lxml")
def test_incorrect_xsl_syntax(datapath):
    from lxml.etree import XMLSyntaxError

    xsl = """\
"""

    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")

    with pytest.raises(
        XMLSyntaxError, match=("Extra content at the end of the document")
    ):
        read_xml(kml, stylesheet=xsl)


@td.skip_if_no("lxml")
def test_incorrect_xsl_eval(datapath):
    from lxml.etree import XSLTParseError

    xsl = """\
"""

    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")

    with pytest.raises(XSLTParseError, match=("failed to compile")):
        read_xml(kml, stylesheet=xsl)


@td.skip_if_no("lxml")
def test_incorrect_xsl_apply(datapath):
    from lxml.etree import XSLTApplyError

    xsl = """\
"""

    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")

    with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")):
        read_xml(kml, stylesheet=xsl)


@td.skip_if_no("lxml")
def test_wrong_stylesheet():
    from lxml.etree import XMLSyntaxError

    kml = os.path.join("data", "xml", "cta_rail_lines.kml")
    xsl = os.path.join("data", "xml", "flatten.xsl")

    with pytest.raises(
        XMLSyntaxError,
        match=("Start tag expected, '<' not found"),
    ):
        read_xml(kml, stylesheet=xsl)


@td.skip_if_no("lxml")
def test_stylesheet_file_close(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")

    xsl_obj: BytesIO | StringIO

    with open(xsl, mode) as f:
        if mode == "rb":
            xsl_obj = BytesIO(f.read())
        else:
            xsl_obj = StringIO(f.read())

        read_xml(kml, stylesheet=xsl_obj)

        assert not f.closed


@td.skip_if_no("lxml")
def test_stylesheet_with_etree():
    kml = os.path.join("data", "xml", "cta_rail_lines.kml")
    xsl = os.path.join("data", "xml", "flatten_doc.xsl")

    with pytest.raises(
        ValueError, match=("To use stylesheet, you need lxml installed")
    ):
        read_xml(kml, parser="etree", stylesheet=xsl)


@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_stylesheet(val):
    from lxml.etree import XMLSyntaxError

    kml = os.path.join("data", "xml", "cta_rail_lines.kml")

    with pytest.raises(
        XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found")
    ):
        read_xml(kml, stylesheet=val)


# ITERPARSE


def test_string_error(parser):
    with pytest.raises(
        ParserError, match=("iterparse is designed for large XML files")
    ):
        read_xml(
            xml_default_nmsp,
            parser=parser,
            iterparse={"row": ["shape", "degrees", "sides", "date"]},
        )


def test_file_like_error(datapath, parser, mode):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        ParserError, match=("iterparse is designed for large XML files")
    ):
        with open(filename) as f:
            read_xml(
                f,
                parser=parser,
                iterparse={"book": ["category", "title", "year", "author", "price"]},
            )


@pytest.mark.network
@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True)
def test_url_path_error(parser):
    url = "https://www.w3schools.com/xml/books.xml"
    with pytest.raises(
        ParserError, match=("iterparse is designed for large XML files")
    ):
        read_xml(
            url,
            parser=parser,
            iterparse={"row": ["shape", "degrees", "sides", "date"]},
        )


def test_compression_error(parser, compression_only):
    with tm.ensure_clean(filename="geom_xml.zip") as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        with pytest.raises(
            ParserError, match=("iterparse is designed for large XML files")
        ):
            read_xml(
                path,
                parser=parser,
                iterparse={"row": ["shape", "degrees", "sides", "date"]},
                compression=compression_only,
            )


def test_wrong_dict_type(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(TypeError, match="list is not a valid type for iterparse"):
        read_xml(
            filename,
            parser=parser,
            iterparse=["category", "title", "year", "author", "price"],
        )


def test_wrong_dict_value(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        TypeError, match="<class 'str'> is not a valid type for value in iterparse"
    ):
        read_xml(filename, parser=parser, iterparse={"book": "category"})


def test_bad_xml(parser):
    bad_xml = """\
<?xml version='1.0' encoding='utf-8'?>
  <row>
    <shape>square</shape>
    <degrees>00360</degrees>
    <sides>4.0</sides>
    <date>2020-01-01</date>
  </row>
  <row>
    <shape>circle</shape>
    <degrees>00360</degrees>
    <sides/>
    <date>2021-01-01</date>
  </row>
  <row>
    <shape>triangle</shape>
    <degrees>00180</degrees>
    <sides>3.0</sides>
    <date>2022-01-01</date>
  </row>
"""
    with tm.ensure_clean(filename="bad.xml") as path:
        with open(path, "w") as f:
            f.write(bad_xml)

        with pytest.raises(
            SyntaxError,
            match=(
                "Extra content at the end of the document|"
                "junk after document element"
            ),
        ):
            read_xml(
                path,
                parser=parser,
                parse_dates=["date"],
                iterparse={"row": ["shape", "degrees", "sides", "date"]},
            )


def test_comment(parser):
    xml = """\
<!-- comment before root -->
<shapes>
  <!-- comment within root -->
  <shape>
    <name>circle</name>
    <type>2D</type>
  </shape>
  <shape>
    <name>sphere</name>
    <type>3D</type>
  </shape>
  <!-- comment within root, after shapes -->
</shapes>
<!-- comment after root -->"""

    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)

    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)


def test_dtd(parser):
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE shapes [
    <!ELEMENT shapes (shape*)>
    <!ELEMENT shape (name, type)>
    <!ELEMENT name (#PCDATA)>
    <!ELEMENT type (#PCDATA)>
]>
<shapes>
  <shape>
    <name>circle</name>
    <type>2D</type>
  </shape>
  <shape>
    <name>sphere</name>
    <type>3D</type>
  </shape>
</shapes>"""

    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)

    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)


def test_processing_instruction(parser):
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<?display table-view?>
<?sort alpha-ascending?>
<?textinfo whitespace is allowed ?>
<?elementnames <shape>, <name>, <type> ?>
<shapes>
  <shape>
    <name>circle</name>
    <type>2D</type>
  </shape>
  <shape>
    <name>sphere</name>
    <type>3D</type>
  </shape>
</shapes>"""

    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)

    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )

    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )

    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)


def test_no_result(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        ParserError, match="No result from selected items in iterparse."
    ):
        read_xml(
            filename,
            parser=parser,
            iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]},
        )


def test_empty_data(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        read_xml(
            filename,
            parser=parser,
            iterparse={"book": ["attr1", "elem1", "elem2", "elem3"]},
        )


@pytest.mark.network
@td.skip_if_no("lxml")
@tm.network(
    url="https://www.w3schools.com/xml/cdcatalog_with_xsl.xml", check_before_test=True
)
def test_online_stylesheet():
    xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml"
    xsl = "https://www.w3schools.com/xml/cdcatalog.xsl"

    df_xsl = read_xml(
        xml,
        xpath=".//tr[td and position() <= 6]",
        names=["title", "artist"],
        stylesheet=xsl,
    )

    df_expected = DataFrame(
        {
            "title": {
                0: "Empire Burlesque",
                1: "Hide your heart",
                2: "Greatest Hits",
                3: "Still got the blues",
                4: "Eros",
            },
            "artist": {
                0: "Bob Dylan",
                1: "Bonnie Tyler",
                2: "Dolly Parton",
                3: "Gary Moore",
                4: "Eros Ramazzotti",
            },
        }
    )

    tm.assert_frame_equal(df_expected, df_xsl)


# COMPRESSION


def test_compression_read(parser, compression_only):
    with tm.ensure_clean() as comp_path:
        geom_df.to_xml(
            comp_path, index=False, parser=parser, compression=compression_only
        )

        df_xpath = read_xml(comp_path, parser=parser, compression=compression_only)

        df_iter = read_xml_iterparse_comp(
            comp_path,
            compression_only,
            parser=parser,
            iterparse={"row": ["shape", "degrees", "sides"]},
            compression=compression_only,
        )

    tm.assert_frame_equal(df_xpath, geom_df)
    tm.assert_frame_equal(df_iter, geom_df)


def test_wrong_compression(parser, compression, compression_only):
    actual_compression = compression
    attempted_compression = compression_only

    if actual_compression == attempted_compression:
        return

    errors = {
        "bz2": (OSError, "Invalid data stream"),
        "gzip": (OSError, "Not a gzipped file"),
        "zip": (BadZipFile, "File is not a zip file"),
        "tar": (ReadError, "file could not be opened successfully"),
    }
    zstd = import_optional_dependency("zstandard", errors="ignore")
    if zstd is not None:
        errors["zstd"] = (zstd.ZstdError, "Unknown frame descriptor")
    lzma = import_optional_dependency("lzma", errors="ignore")
    if lzma is not None:
        errors["xz"] = (LZMAError, "Input format not supported by decoder")
    error_cls, error_str = errors[attempted_compression]

    with tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression=actual_compression)

        with pytest.raises(error_cls, match=error_str):
            read_xml(path, parser=parser, compression=attempted_compression)


def test_unsuported_compression(parser):
    with pytest.raises(ValueError, match="Unrecognized compression type"):
        with tm.ensure_clean() as path:
            read_xml(path, parser=parser, compression="7z")


# STORAGE OPTIONS


@pytest.mark.network
@td.skip_if_no("s3fs")
@td.skip_if_no("lxml")
@pytest.mark.skipif(
    is_ci_environment(),
    reason="2022.1.17: Hanging on the CI min versions build.",
)
@tm.network
def test_s3_parser_consistency():
    # Python Software Foundation (2019 IRS-990 RETURN)
    s3 = "s3://irs-form-990/201923199349319487_public.xml"

    df_lxml = read_xml(
        s3,
        xpath=".//irs:Form990PartVIISectionAGrp",
        namespaces={"irs": "http://www.irs.gov/efile"},
        parser="lxml",
        storage_options={"anon": True},
    )

    df_etree = read_xml(
        s3,
        xpath=".//irs:Form990PartVIISectionAGrp",
        namespaces={"irs": "http://www.irs.gov/efile"},
        parser="etree",
        storage_options={"anon": True},
    )

    tm.assert_frame_equal(df_lxml, df_etree)