Initial Debugging Completed and Execution Successful
36 binary files not shown.
@@ -0,0 +1,382 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    expected = DataFrame(
        [
            ["foo", 2, 3, 4, 5],
            ["bar", 7, 8, 9, 10],
            ["baz", 12, 13, 14, 15],
            ["qux", 12, 13, 14, 15],
            ["foo2", 12, 13, 14, 15],
            ["bar2", 12, 13, 14, 15],
        ],
        columns=["index", "A", "B", "C", "D"],
    )
    expected = expected.set_index("index")

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
                list(reader)
        return

    with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
        chunks = list(reader)
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    msg = r"'chunksize' must be an integer >=1"
    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
            pass


@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    # see gh-15755
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
        tm.assert_frame_equal(concat(reader), expected)


def test_read_chunksize_and_nrows_changing_size(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
        tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
        tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])

        with pytest.raises(StopIteration, match=""):
            reader.get_chunk(size=3)


def test_get_chunk_passed_chunksize(all_parsers):
    parser = all_parsers
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), chunksize=2) as reader:
                reader.get_chunk()
        return

    with parser.read_csv(StringIO(data), chunksize=2) as reader:
        result = reader.get_chunk()

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
    # see gh-12185
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
                concat(reader)
        return

    with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
        via_reader = concat(reader)
    tm.assert_frame_equal(via_reader, result)


def test_read_chunksize_jagged_names(all_parsers):
    # see gh-23509
    parser = all_parsers
    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(
                StringIO(data), names=range(10), chunksize=4
            ) as reader:
                concat(reader)
        return

    with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
        result = concat(reader)
    tm.assert_frame_equal(result, expected)


def test_chunk_begins_with_newline_whitespace(all_parsers):
    # see gh-10022
    parser = all_parsers
    data = "\n hello\nworld\n"

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([" hello", "world"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
    # mainly an issue with the C parser
    heuristic = 2**3
    parser = all_parsers
    integers = [str(i) for i in range(heuristic - 1)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    # Coercions should work without warnings.
    with monkeypatch.context() as m:
        m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
        result = parser.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == float


def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
    warning_type = None
    parser = all_parsers
    size = 10000

    # see gh-3866: if chunks are different types and can't
    # be coerced using numerical types, then issue warning.
    if parser.engine == "c" and parser.low_memory:
        warning_type = DtypeWarning
        # Use larger size to hit warning path
        size = 499999

    integers = [str(i) for i in range(size)]
    data = "a\n" + "\n".join(integers + ["a", "b"] + integers)

    buf = StringIO(data)

    if parser.engine == "pyarrow":
        df = parser.read_csv(
            buf,
        )
    else:
        df = parser.read_csv_check_warnings(
            warning_type,
            r"Columns \(0\) have mixed types. "
            "Specify dtype option on import or set low_memory=False.",
            buf,
        )
    if parser.engine == "c" and parser.low_memory:
        assert df.a.dtype == object
    elif using_infer_string:
        assert df.a.dtype == "str"
    else:
        assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    # see gh-9535
    parser = all_parsers
    expected = DataFrame(columns=["foo", "bar"])

    nrows = 10
    data = StringIO("foo,bar\n")

    if parser.engine == "pyarrow":
        msg = (
            "The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=msg):
            if iterator:
                with parser.read_csv(data, chunksize=nrows) as reader:
                    next(iter(reader))
            else:
                parser.read_csv(data, nrows=nrows)
        return

    if iterator:
        with parser.read_csv(data, chunksize=nrows) as reader:
            result = next(iter(reader))
    else:
        result = parser.read_csv(data, nrows=nrows)

    tm.assert_frame_equal(result, expected)


def test_read_csv_memory_growth_chunksize(all_parsers):
    # see gh-24805
    #
    # Let's just make sure that we don't crash
    # as we iteratively process all chunks.
    parser = all_parsers

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as f:
            for i in range(1000):
                f.write(str(i) + "\n")

        if parser.engine == "pyarrow":
            msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                with parser.read_csv(path, chunksize=20) as result:
                    for _ in result:
                        pass
            return

        with parser.read_csv(path, chunksize=20) as result:
            for _ in result:
                pass


def test_chunksize_with_usecols_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """1,2,3,4
5,6,7,8
9,10,11
"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                names=["a", "b"],
                chunksize=2,
                usecols=[0, 1],
                header=None,
            )
        return

    result_chunks = parser.read_csv(
        StringIO(data),
        names=["a", "b"],
        chunksize=2,
        usecols=[0, 1],
        header=None,
    )

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6]}),
        DataFrame({"a": [9], "b": [10]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])


def test_chunksize_second_block_shorter(all_parsers):
    # GH#21211
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), chunksize=2)
        return

    result_chunks = parser.read_csv(StringIO(data), chunksize=2)

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
        DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
    ]

    for i, result in enumerate(result_chunks):
        tm.assert_frame_equal(result, expected_frames[i])
@@ -0,0 +1,983 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from inspect import signature
from io import StringIO
import os
from pathlib import Path
import sys

import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.errors import (
    EmptyDataError,
    ParserError,
    ParserWarning,
)

from pandas import (
    DataFrame,
    Index,
    Timestamp,
    compat,
)
import pandas._testing as tm

from pandas.io.parsers import TextFileReader
from pandas.io.parsers.c_parser_wrapper import CParserWrapper

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        def __init__(self) -> None:
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {
        "usecols": [0, 2, 3],
        "parse_dates": parse_dates,
        "delimiter": ",",
    }
    parser.engine = "c"
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)


def test_read_csv_local(all_parsers, csv1):
    prefix = "file:///" if compat.is_platform_windows() else "file://"
    parser = all_parsers

    fname = prefix + str(os.path.abspath(csv1))
    result = parser.read_csv(fname, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if parser.engine == "pyarrow":
        result.index = result.index.as_unit("ns")
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)


def test_1000_sep(all_parsers):
    parser = all_parsers
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})

    if parser.engine == "pyarrow":
        msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep="|", thousands=",")
        return

    result = parser.read_csv(StringIO(data), sep="|", thousands=",")
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_unnamed_columns(all_parsers):
    data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
        dtype=np.int64,
        columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
    )
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_csv_mixed_type(all_parsers):
    data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
    parser = all_parsers
    expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_read_csv_low_memory_no_rows_with_index(all_parsers):
    # see gh-21141
    parser = all_parsers

    if not parser.low_memory:
        pytest.skip("This is a low-memory specific test")

    data = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
        return

    result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
    expected = DataFrame(columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


def test_read_csv_dataframe(all_parsers, csv1):
    parser = all_parsers
    result = parser.read_csv(csv1, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if parser.engine == "pyarrow":
        result.index = result.index.as_unit("ns")
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
    # see gh-10476
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    expected = DataFrame(
        [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
        columns=["index", "A", "B", "C", "D"],
    )
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), nrows=nrows)
        return

    result = parser.read_csv(StringIO(data), nrows=nrows)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    msg = r"'nrows' must be an integer >=0"
    parser = all_parsers
    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), nrows=nrows)


def test_nrows_skipfooter_errors(all_parsers):
    msg = "'skipfooter' not supported with 'nrows'"
    data = "a\n1\n2\n3\n4\n5\n6"
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=1, nrows=5)


@skip_pyarrow
def test_missing_trailing_delimiters(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""

    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
        columns=["A", "B", "C", "D"],
    )
    tm.assert_frame_equal(result, expected)


def test_skip_initial_space(all_parsers):
    data = (
        '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
        "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
        "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
        "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
        "0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
        "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
    )
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                names=list(range(33)),
                header=None,
                na_values=["-9999.0"],
                skipinitialspace=True,
            )
        return

    result = parser.read_csv(
        StringIO(data),
        names=list(range(33)),
        header=None,
        na_values=["-9999.0"],
        skipinitialspace=True,
    )
    expected = DataFrame(
        [
            [
                "09-Apr-2012",
                "01:10:18.300",
                2456026.548822908,
                12849,
                1.00361,
                1.12551,
                330.65659,
                355626618.16711,
                73.48821,
                314.11625,
                1917.09447,
                179.71425,
                80.0,
                240.0,
                -350,
                70.06056,
                344.9837,
                1,
                1,
                -0.689265,
                -0.692787,
                0.212036,
                14.7674,
                41.605,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                0,
                12,
                128,
            ]
        ]
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_trailing_delimiters(all_parsers):
    # see gh-2442
    data = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=False)

    expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


def test_escapechar(all_parsers):
    # https://stackoverflow.com/questions/13824840/feature-request-for-
    # pandas-read-csv
    data = '''SEARCH_TERM,ACTUAL_URL
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''

    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
    )

    assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'

    tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))


def test_ignore_leading_whitespace(all_parsers):
    # see gh-3374, gh-6607
    parser = all_parsers
    data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"

    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+")
        return
    result = parser.read_csv(StringIO(data), sep=r"\s+")

    expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
    # see gh-12203
    parser = all_parsers
    data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""

    if usecols is None:
        # Make sure that an error is still raised
        # when the "usecols" parameter is not provided.
        msg = r"Expected \d+ fields in line \d+, saw \d+"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
    else:
        expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})

        result = parser.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # First, check to see that the response of parser when faced with no
        # provided columns raises the correct error, with or without usecols.
        ("", {}, None),
        ("", {"usecols": ["X"]}, None),
        (
            ",,",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"], index=[0], dtype=np.float64),
        ),
        (
            "",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"]),
        ),
    ],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
    # see gh-12493
    parser = all_parsers

    if expected is None:
        msg = "No columns to parse from file"
        with pytest.raises(EmptyDataError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # gh-8661, gh-8679: this should ignore six lines, including
        # lines with trailing whitespace and blank lines.
        (
            {
                "header": None,
                "delim_whitespace": True,
                "skiprows": [0, 1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
        ),
        # gh-8983: test skipping set of rows after a row with trailing spaces.
        (
            {
                "delim_whitespace": True,
                "skiprows": [1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
        ),
    ],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
    data = "A B C  \nrandom line with trailing spaces    \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n   \n5.1,NaN,10.0\n"  # noqa: E501
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data.replace(",", "  ")), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_raise_on_sep_with_delim_whitespace(all_parsers):
    # see gh-6607
    data = "a b c\n1 2 3"
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with pytest.raises(ValueError, match="you can only specify one"):
        with tm.assert_produces_warning(
            FutureWarning, match=depr_msg, check_stacklevel=False
        ):
            parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)


def test_read_filepath_or_buffer(all_parsers):
    # see gh-43366
    parser = all_parsers

    with pytest.raises(TypeError, match="Expected file path name or file-like"):
        parser.read_csv(filepath_or_buffer=b"input")


@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
    # see gh-9710
    parser = all_parsers
    data = """\
   MyColumn
   a
   b
   a
   b\n"""

    expected = DataFrame({"MyColumn": list("abab")})
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(
                    StringIO(data),
                    skipinitialspace=True,
                    delim_whitespace=delim_whitespace,
                )
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "sep,skip_blank_lines,exp_data",
    [
        (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (
            ",",
            False,
            [
                [1.0, 2.0, 4.0],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan],
                [5.0, np.nan, 10.0],
                [np.nan, np.nan, np.nan],
                [-70.0, 0.4, 1.0],
            ],
        ),
    ],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
    parser = all_parsers
    data = """\
A,B,C
1,2.,4.


5.,NaN,10.0

-70,.4,1
"""

    if sep == r"\s+":
        data = data.replace(",", "  ")

    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines
            )
        return

    result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
    expected = DataFrame(exp_data, columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_whitespace_lines(all_parsers):
    parser = all_parsers
    data = """

\t  \t\t
\t
A,B,C
\t    1,2.,4.
5.,NaN,10.0
"""
    expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,expected",
    [
        (
            """   A   B   C   D
a   1   2   3   4
b   1   2   3   4
c   1   2   3   4
""",
            DataFrame(
                [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                columns=["A", "B", "C", "D"],
                index=["a", "b", "c"],
            ),
        ),
        (
            " a b c\n1 2 3 \n4 5 6\n 7 8 9",
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
        ),
    ],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
    # see gh-6607
    parser = all_parsers
    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+")
        return

    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


def test_sub_character(all_parsers, csv_dir_path):
    # see gh-16893
    filename = os.path.join(csv_dir_path, "sub_char.csv")
    expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])

    parser = all_parsers
    result = parser.read_csv(filename)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
def test_filename_with_special_chars(all_parsers, filename):
    # see gh-15086.
    parser = all_parsers
    df = DataFrame({"a": [1, 2, 3]})

    with tm.ensure_clean(filename) as path:
        df.to_csv(path, index=False)

        result = parser.read_csv(path)
        tm.assert_frame_equal(result, df)


def test_read_table_same_signature_as_read_csv(all_parsers):
    # GH-34976
    parser = all_parsers

    table_sign = signature(parser.read_table)
    csv_sign = signature(parser.read_csv)

    assert table_sign.parameters.keys() == csv_sign.parameters.keys()
    assert table_sign.return_annotation == csv_sign.return_annotation

    for key, csv_param in csv_sign.parameters.items():
        table_param = table_sign.parameters[key]
        if key == "sep":
            assert csv_param.default == ","
            assert table_param.default == "\t"
            assert table_param.annotation == csv_param.annotation
            assert table_param.kind == csv_param.kind
            continue

        assert table_param == csv_param


def test_read_table_equivalency_to_read_csv(all_parsers):
    # see gh-21948
    # As of 0.25.0, read_table is undeprecated
    parser = all_parsers
    data = "a\tb\n1\t2\n3\t4"
    expected = parser.read_csv(StringIO(data), sep="\t")
    result = parser.read_table(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
    # GH#41069
    parser = all_parsers
    data = "a b\n0 1"

    sys.setprofile(lambda *a, **k: None)
    result = getattr(parser, read_func)(StringIO(data))
    sys.setprofile(None)

    expected = DataFrame({"a b": ["0 1"]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_first_row_bom(all_parsers):
    # see gh-26545
    parser = all_parsers
    data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''

    result = parser.read_csv(StringIO(data), delimiter="\t")
    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_first_row_bom_unquoted(all_parsers):
    # see gh-36343
    parser = all_parsers
    data = """\ufeffHead1\tHead2\tHead3"""

    result = parser.read_csv(StringIO(data), delimiter="\t")
    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
    # GH 28071
    ref = DataFrame(
        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
        columns=list("ab"),
    )
    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
            )
        return

    df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
    tm.assert_frame_equal(df, ref[:nrows])


@skip_pyarrow
def test_no_header_two_extra_columns(all_parsers):
    # GH 26218
    column_names = ["one", "two", "three"]
    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
    stream = StringIO("foo,bar,baz,bam,blah")
    parser = all_parsers
    df = parser.read_csv_check_warnings(
        ParserWarning,
        "Length of header or names does not match length of data. "
        "This leads to a loss of data with index_col=False.",
        stream,
        header=None,
        names=column_names,
        index_col=False,
    )
    tm.assert_frame_equal(df, ref)


def test_read_csv_names_not_accepting_sets(all_parsers):
    # GH 34946
    data = """\
    1,2,3
    4,5,6\n"""
    parser = all_parsers
    with pytest.raises(ValueError, match="Names should be an ordered collection."):
        parser.read_csv(StringIO(data), names=set("QAZ"))


def test_read_table_delim_whitespace_default_sep(all_parsers):
    # GH: 35958
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    parser = all_parsers

    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_table(f, delim_whitespace=True)
        return
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_table(f, delim_whitespace=True)
    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
    # GH: 35958
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    parser = all_parsers
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(f, delim_whitespace=True, sep=delimiter)

        with pytest.raises(ValueError, match=msg):
            parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


def test_read_csv_delimiter_and_sep_no_default(all_parsers):
    # GH#39823
    f = StringIO("a,b\n1,2")
    parser = all_parsers
    msg = "Specified a sep and a delimiter; you can only specify one."
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(f, sep=" ", delimiter=".")


@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
    # GH#43528
    parser = all_parsers
    data = """a,b,c
1,2,3
"""
    msg = (
        r"Specified \\n as separator or delimiter. This forces the python engine "
        r"which does not accept a line terminator. Hence it is not allowed to use "
        r"the line terminator as separator."
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
    # GH: 35958
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    parser = all_parsers
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        with pytest.raises(ValueError, match=msg):
            parser.read_table(f, delim_whitespace=True, sep=delimiter)

        with pytest.raises(ValueError, match=msg):
            parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


@skip_pyarrow
def test_dict_keys_as_names(all_parsers):
    # GH: 36928
    data = "1,2"

    keys = {"a": int, "b": int}.keys()
    parser = all_parsers

    result = parser.read_csv(StringIO(data), names=keys)
    expected = DataFrame({"a": [1], "b": [2]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@xfail_pyarrow  # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
    # GH39017
    parser = all_parsers
    content = b"\xed\xbd\xbf"
    decoded = content.decode("utf-8", errors="surrogatepass")
    expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
    expected.index.name = decoded * 2

    with tm.ensure_clean() as path:
        Path(path).write_bytes(
            content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
        )
        df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
        tm.assert_frame_equal(df, expected)
        with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
            parser.read_csv(path)


def test_malformed_second_line(all_parsers):
    # see GH14782
    parser = all_parsers
    data = "\na\nb\n"
    result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1)
    expected = DataFrame({"a": ["b"]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_short_single_line(all_parsers):
    # GH 47566
    parser = all_parsers
    columns = ["a", "b", "c"]
    data = "1,2"
    result = parser.read_csv(StringIO(data), header=None, names=columns)
    expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Length mismatch: Expected axis has 2 elements
def test_short_multi_line(all_parsers):
    # GH 47566
    parser = all_parsers
    columns = ["a", "b", "c"]
    data = "1,2\n1,2"
    result = parser.read_csv(StringIO(data), header=None, names=columns)
    expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
    tm.assert_frame_equal(result, expected)


def test_read_seek(all_parsers):
    # GH48646
    parser = all_parsers
    prefix = "### DATA\n"
    content = "nkey,value\ntables,rectangular\n"
    with tm.ensure_clean() as path:
        Path(path).write_text(prefix + content, encoding="utf-8")
        with open(path, encoding="utf-8") as file:
            file.readline()
            actual = parser.read_csv(file)
        expected = parser.read_csv(StringIO(content))
    tm.assert_frame_equal(actual, expected)
@@ -0,0 +1,91 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
import csv
from io import StringIO

import pytest

from pandas import DataFrame
import pandas._testing as tm

from pandas.io.parsers import TextParser

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow
def test_read_data_list(all_parsers):
    parser = all_parsers
    kwargs = {"index_col": 0}
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"

    data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
    expected = parser.read_csv(StringIO(data), **kwargs)

    with TextParser(data_list, chunksize=2, **kwargs) as parser:
        result = parser.read()

    tm.assert_frame_equal(result, expected)


def test_reader_list(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


def test_reader_list_skiprows(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)

    tm.assert_frame_equal(chunks[0], expected[1:3])


def test_read_csv_parse_simple_list(all_parsers):
    parser = all_parsers
    data = """foo
bar baz
qux foo
foo
bar"""

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,72 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import pytest

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
    parser = all_parsers
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    if parser.engine == "pyarrow":
        msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), sep="|", thousands=thousands, decimal=decimal
            )
        return

    result = parser.read_csv(
        StringIO(data), sep="|", thousands=thousands, decimal=decimal
    )
    tm.assert_frame_equal(result, expected)


def test_euro_decimal_format(all_parsers):
    parser = all_parsers
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    result = parser.read_csv(StringIO(data), sep=";", decimal=",")
    expected = DataFrame(
        [
            [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
            [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
            [3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
        ],
        columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
    )
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,478 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import (
    BytesIO,
    StringIO,
)
import os
import platform
from urllib.error import URLError
import uuid

import numpy as np
import pytest

from pandas.errors import (
    EmptyDataError,
    ParserError,
)
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.network
@pytest.mark.single_cpu
def test_url(all_parsers, csv_dir_path, httpserver):
    parser = all_parsers
    kwargs = {"sep": "\t"}

    local_path = os.path.join(csv_dir_path, "salaries.csv")
    with open(local_path, encoding="utf-8") as f:
        httpserver.serve_content(content=f.read())

    url_result = parser.read_csv(httpserver.url, **kwargs)

    local_result = parser.read_csv(local_path, **kwargs)
    tm.assert_frame_equal(url_result, local_result)


@pytest.mark.slow
def test_local_file(all_parsers, csv_dir_path):
    parser = all_parsers
    kwargs = {"sep": "\t"}

    local_path = os.path.join(csv_dir_path, "salaries.csv")
    local_result = parser.read_csv(local_path, **kwargs)
    url = "file://localhost/" + local_path

    try:
        url_result = parser.read_csv(url, **kwargs)
        tm.assert_frame_equal(url_result, local_result)
    except URLError:
        # Fails on some systems.
        pytest.skip("Failing on: " + " ".join(platform.uname()))


@xfail_pyarrow  # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
    parser = all_parsers
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
    tm.assert_frame_equal(df, result)


@xfail_pyarrow  # AssertionError: DataFrame.index are different
def test_path_local_path(all_parsers):
    parser = all_parsers
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    result = tm.round_trip_localpath(
        df.to_csv, lambda p: parser.read_csv(p, index_col=0)
    )
    tm.assert_frame_equal(df, result)


def test_nonexistent_path(all_parsers):
    # gh-2428: pls no segfault
    # gh-14086: raise more helpful FileNotFoundError
    # GH#29233 "File foo" instead of "File b'foo'"
    parser = all_parsers
    path = f"{uuid.uuid4()}.csv"

    msg = r"\[Errno 2\]"
    with pytest.raises(FileNotFoundError, match=msg) as e:
        parser.read_csv(path)
    assert path == e.value.filename


@td.skip_if_windows  # os.chmod does not work in windows
def test_no_permission(all_parsers):
    # GH 23784
    parser = all_parsers

    msg = r"\[Errno 13\]"
    with tm.ensure_clean() as path:
        os.chmod(path, 0)  # make file unreadable

        # verify that this process cannot open the file (not running as sudo)
        try:
            with open(path, encoding="utf-8"):
                pass
            pytest.skip("Running as sudo.")
        except PermissionError:
            pass

        with pytest.raises(PermissionError, match=msg) as e:
            parser.read_csv(path)
        assert path == e.value.filename


@pytest.mark.parametrize(
    "data,kwargs,expected,msg",
    [
        # gh-10728: WHITESPACE_LINE
        (
            "a,b,c\n4,5,6\n ",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # gh-10548: EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL_NOP
        (
            "a,b,c\n4,5,6\n\r",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_COMMENT
        (
            "a,b,c\n4,5,6#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # SKIP_LINE
        (
            "a,b,c\n4,5,6\nskipme",
            {"skiprows": [2]},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#", "skip_blank_lines": False},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # IN_FIELD
        (
            "a,b,c\n4,5,6\n ",
            {"skip_blank_lines": False},
            DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL
        (
            "a,b,c\n4,5,6\n\r",
            {"skip_blank_lines": False},
            DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # ESCAPED_CHAR
        (
            "a,b,c\n4,5,6\n\\",
            {"escapechar": "\\"},
            None,
            "(EOF following escape character)|(unexpected end of data)",
        ),
        # ESCAPE_IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"\\',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
        # IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
    ],
    ids=[
        "whitespace-line",
        "eat-line-comment",
        "eat-crnl-nop",
        "eat-comment",
        "skip-line",
        "eat-line-comment",
        "in-field",
        "eat-crnl",
        "escaped-char",
        "escape-in-quoted-field",
        "in-quoted-field",
    ],
)
def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
    # see gh-10728, gh-10548
    parser = all_parsers

    if parser.engine == "pyarrow" and "comment" in kwargs:
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
        return

    if parser.engine == "pyarrow" and "\r" not in data:
        # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1:
        # ValueError: skiprows argument must be an integer when using engine='pyarrow'
        # AssertionError: Regex pattern did not match.
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    if expected is None:
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


def test_temporary_file(all_parsers):
    # see gh-13398
    parser = all_parsers
    data = "0 0"

    with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
        new_file.write(data)
        new_file.flush()
        new_file.seek(0)

        if parser.engine == "pyarrow":
            msg = "the 'pyarrow' engine does not support regex separators"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(new_file, sep=r"\s+", header=None)
            return

        result = parser.read_csv(new_file, sep=r"\s+", header=None)

        expected = DataFrame([[0, 0]])
        tm.assert_frame_equal(result, expected)


def test_internal_eof_byte(all_parsers):
    # see gh-5500
    parser = all_parsers
    data = "a,b\n1\x1a,2"

    expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_internal_eof_byte_to_file(all_parsers):
    # see gh-16559
    parser = all_parsers
    data = b'c1,c2\r\n"test \x1a test", test\r\n'
    expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
    path = f"__{uuid.uuid4()}__.csv"

    with tm.ensure_clean(path) as path:
        with open(path, "wb") as f:
            f.write(data)

        result = parser.read_csv(path)
        tm.assert_frame_equal(result, expected)


def test_file_handle_string_io(all_parsers):
    # gh-14418
    #
    # Don't close user provided file handles.
    parser = all_parsers
    data = "a,b\n1,2"

    fh = StringIO(data)
    parser.read_csv(fh)
    assert not fh.closed


def test_file_handles_with_open(all_parsers, csv1):
    # gh-14418
    #
    # Don't close user provided file handles.
    parser = all_parsers

    for mode in ["r", "rb"]:
        with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f:
            parser.read_csv(f)
            assert not f.closed


def test_invalid_file_buffer_class(all_parsers):
    # see gh-15337
    class InvalidBuffer:
        pass

    parser = all_parsers
    msg = "Invalid file path or buffer object type"
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(InvalidBuffer())
|
||||
|
||||
|
||||
def test_invalid_file_buffer_mock(all_parsers):
|
||||
# see gh-15337
|
||||
parser = all_parsers
|
||||
msg = "Invalid file path or buffer object type"
|
||||
|
||||
class Foo:
|
||||
pass
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(Foo())
|
||||
|
||||
|
||||
def test_valid_file_buffer_seems_invalid(all_parsers):
|
||||
# gh-16135: we want to ensure that "tell" and "seek"
|
||||
# aren't actually being used when we call `read_csv`
|
||||
#
|
||||
# Thus, while the object may look "invalid" (these
|
||||
# methods are attributes of the `StringIO` class),
|
||||
# it is still a valid file-object for our purposes.
|
||||
class NoSeekTellBuffer(StringIO):
|
||||
def tell(self):
|
||||
raise AttributeError("No tell method")
|
||||
|
||||
def seek(self, pos, whence=0):
|
||||
raise AttributeError("No seek method")
|
||||
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1]})
|
||||
|
||||
result = parser.read_csv(NoSeekTellBuffer(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
|
||||
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||||
def test_read_csv_file_handle(all_parsers, io_class, encoding):
|
||||
"""
|
||||
Test whether read_csv does not close user-provided file handles.
|
||||
|
||||
GH 36980
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
|
||||
content = "a,b\n1,2"
|
||||
handle = io_class(content.encode("utf-8") if io_class == BytesIO else content)
|
||||
|
||||
tm.assert_frame_equal(parser.read_csv(handle, encoding=encoding), expected)
|
||||
assert not handle.closed
|
||||
|
||||
|
||||
def test_memory_map_compression(all_parsers, compression):
|
||||
"""
|
||||
Support memory map for compressed files.
|
||||
|
||||
GH 37621
|
||||
"""
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
expected.to_csv(path, index=False, compression=compression)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(path, memory_map=True, compression=compression)
|
||||
return
|
||||
|
||||
result = parser.read_csv(path, memory_map=True, compression=compression)
|
||||
|
||||
tm.assert_frame_equal(
|
||||
result,
|
||||
expected,
|
||||
)
|
||||
|
||||
|
||||
def test_context_manager(all_parsers, datapath):
|
||||
# make sure that opened files are closed
|
||||
parser = all_parsers
|
||||
|
||||
path = datapath("io", "data", "csv", "iris.csv")
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(path, chunksize=1)
|
||||
return
|
||||
|
||||
reader = parser.read_csv(path, chunksize=1)
|
||||
assert not reader.handles.handle.closed
|
||||
try:
|
||||
with reader:
|
||||
next(reader)
|
||||
assert False
|
||||
except AssertionError:
|
||||
assert reader.handles.handle.closed
|
||||
|
||||
|
||||
def test_context_manageri_user_provided(all_parsers, datapath):
|
||||
# make sure that user-provided handles are not closed
|
||||
parser = all_parsers
|
||||
|
||||
with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(path, chunksize=1)
|
||||
return
|
||||
|
||||
reader = parser.read_csv(path, chunksize=1)
|
||||
assert not reader.handles.handle.closed
|
||||
try:
|
||||
with reader:
|
||||
next(reader)
|
||||
assert False
|
||||
except AssertionError:
|
||||
assert not reader.handles.handle.closed
|
||||
|
||||
|
||||
@skip_pyarrow # ParserError: Empty CSV file
|
||||
def test_file_descriptor_leak(all_parsers, using_copy_on_write):
|
||||
# GH 31488
|
||||
parser = all_parsers
|
||||
with tm.ensure_clean() as path:
|
||||
with pytest.raises(EmptyDataError, match="No columns to parse from file"):
|
||||
parser.read_csv(path)
|
||||
|
||||
|
||||
def test_memory_map(all_parsers, csv_dir_path):
|
||||
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
|
||||
)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(mmap_file, memory_map=True)
|
||||
return
|
||||
|
||||
result = parser.read_csv(mmap_file, memory_map=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,79 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas.compat import is_platform_linux

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@skip_pyarrow  # ParserError: CSV parse error: Empty CSV file or block
def test_float_parser(all_parsers):
    # see gh-9565
    parser = all_parsers
    data = "45e-1,4.5,45.,inf,-inf"
    result = parser.read_csv(StringIO(data), header=None)

    expected = DataFrame([[float(s) for s in data.split(",")]])
    tm.assert_frame_equal(result, expected)


def test_scientific_no_exponent(all_parsers_all_precisions):
    # see gh-12215
    df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
    data = df.to_csv(index=False)
    parser, precision = all_parsers_all_precisions

    df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
    tm.assert_frame_equal(df_roundtrip, df)


@pytest.mark.parametrize(
    "neg_exp",
    [
        -617,
        -100000,
        pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan),
    ],
)
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
    # GH#38753
    parser, precision = all_parsers_all_precisions

    data = f"data\n10E{neg_exp}"
    result = parser.read_csv(StringIO(data), float_precision=precision)
    expected = DataFrame({"data": [0.0]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.skip_ubsan
@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
    # GH#38753
    parser, precision = all_parsers_all_precisions
    data = f"data\n10E{exp}"
    result = parser.read_csv(StringIO(data), float_precision=precision)
    if precision == "round_trip":
        if exp == 999999999999999999 and is_platform_linux():
            mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
            request.applymarker(mark)

        value = np.inf if exp > 0 else 0.0
        expected = DataFrame({"data": [value]})
    else:
        expected = DataFrame({"data": [f"10E{exp}"]})

    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,304 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from io import StringIO
import os

import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""",
            {"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
                columns=["A", "B", "C", "D"],
            ),
        ),
        (
            """foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""",
            {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=MultiIndex.from_tuples(
                    [
                        ("foo", "one"),
                        ("foo", "two"),
                        ("foo", "three"),
                        ("bar", "one"),
                        ("bar", "two"),
                    ],
                    names=["index1", "index2"],
                ),
                columns=["A", "B", "C", "D"],
            ),
        ),
    ],
)
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(
    request, all_parsers, index_col, using_infer_string
):
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    headless_data = "\n".join(data.split("\n")[1:])

    names = ["A", "B", "C", "D"]
    parser = all_parsers

    result = parser.read_csv(
        StringIO(headless_data), index_col=index_col, header=None, names=names
    )
    expected = parser.read_csv(StringIO(data), index_col=index_col)

    # No index names in headless data.
    expected.index.names = [None] * 2
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_multi_index_no_level_names_implicit(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=MultiIndex.from_tuples(
            [
                ("foo", "one"),
                ("foo", "two"),
                ("foo", "three"),
                ("bar", "one"),
                ("bar", "two"),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "data,expected,header",
    [
        ("a,b", DataFrame(columns=["a", "b"]), [0]),
        (
            "a,b\nc,d",
            DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
            [0, 1],
        ),
    ],
)
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
    # see gh-14545
    parser = all_parsers
    data = expected.to_csv(index=False) if round_trip else data

    result = parser.read_csv(StringIO(data), header=header)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # AssertionError: DataFrame.columns are different
def test_no_unnamed_index(all_parsers):
    parser = all_parsers
    data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
    result = parser.read_csv(StringIO(data), sep=" ")
    expected = DataFrame(
        [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
        columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
    )
    tm.assert_frame_equal(result, expected)


def test_read_duplicate_index_explicit(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_read_duplicate_index_implicit(all_parsers):
    data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame(
        [
            [2, 3, 4, 5],
            [7, 8, 9, 10],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
            [12, 13, 14, 15],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
    parser = all_parsers
    csv2 = os.path.join(csv_dir_path, "test2.csv")
    result = parser.read_csv(csv2, index_col=0, parse_dates=True)

    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
            [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
            [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
            [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
            [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
        ],
        columns=["A", "B", "C", "D", "E"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
            ]
        ),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_empty_with_index(all_parsers):
    # see gh-10184
    data = "x,y"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame(columns=["y"], index=Index([], name="x"))
    tm.assert_frame_equal(result, expected)


# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_multi_index(all_parsers):
    # see gh-10467
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=["x", "y"])

    expected = DataFrame(
        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
    )
    tm.assert_frame_equal(result, expected)


# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=[1, 0])

    expected = DataFrame(
        columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
    )
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,78 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    DataFrame,
    option_context,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow  # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
    parser = all_parsers
    data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""
    expected = DataFrame(
        {"A": [float("inf"), float("-inf")] * 5},
        index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
    )
    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_infinity_parsing(all_parsers, na_filter):
    parser = all_parsers
    data = """\
,A
a,Infinity
b,-Infinity
c,+Infinity
"""
    expected = DataFrame(
        {"A": [float("infinity"), float("-infinity"), float("+infinity")]},
        index=["a", "b", "c"],
    )
    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


def test_read_csv_with_use_inf_as_na(all_parsers):
    # https://github.com/pandas-dev/pandas/issues/35493
    parser = all_parsers
    data = "1.0\nNaN\n3.0"
    msg = "use_inf_as_na option is deprecated"
    warn = FutureWarning
    if parser.engine == "pyarrow":
        warn = (FutureWarning, DeprecationWarning)

    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
        with option_context("use_inf_as_na", True):
            result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([1.0, np.nan, 3.0])
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,231 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_int_conversion(all_parsers):
    data = """A,B
1.0,1
2.0,2
3.0,3
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            "A,B\nTrue,1\nFalse,2\nTrue,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
            {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
            DataFrame(
                [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
                columns=["A", "B"],
            ),
        ),
        (
            "A,B\nTRUE,1\nFALSE,2\nTRUE,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nfoo,bar\nbar,foo",
            {"true_values": ["foo"], "false_values": ["bar"]},
            DataFrame([[True, False], [False, True]], columns=["A", "B"]),
        ),
    ],
)
def test_parse_bool(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_parse_integers_above_fp_precision(all_parsers):
    data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        {
            "Numbers": [
                17007000002000191,
                17007000002000191,
                17007000002000191,
                17007000002000191,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000194,
            ]
        }
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
    # see gh-2601
    data = "65248E10 11\n55555E55 22\n"
    parser = all_parsers
    if parser.engine == "pyarrow" and sep != " ":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), header=None, sep=sep)
        return

    result = parser.read_csv(StringIO(data), header=None, sep=sep)
    expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
    tm.assert_frame_equal(result, expected)


def test_int64_min_issues(all_parsers):
    # see gh-2599
    parser = all_parsers
    data = "A,B\n0,0\n0,"
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv, request):
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""
    parser = all_parsers

    if conv is None:
        # 13007854817840016671868 > UINT64_MAX, so this
        # will overflow and return object as the dtype.
        if parser.engine == "pyarrow":
            mark = pytest.mark.xfail(reason="parses to float64")
            request.applymarker(mark)

        result = parser.read_csv(StringIO(data))
        expected = DataFrame(
            [
                "00013007854817840016671868",
                "00013007854817840016749251",
                "00013007854817840016754630",
                "00013007854817840016781876",
                "00013007854817840017028824",
                "00013007854817840017963235",
                "00013007854817840018860166",
            ],
            columns=["ID"],
        )
        tm.assert_frame_equal(result, expected)
    else:
        # 13007854817840016671868 > UINT64_MAX, so attempts
        # to cast to either int64 or uint64 will result in
        # an OverflowError being raised.
        msg = "|".join(
            [
                "Python int too large to convert to C long",
                "long too big to convert",
                "int too big to convert",
            ]
        )
        err = OverflowError
        if parser.engine == "pyarrow":
            err = ValueError
            msg = "The 'converters' option is not supported with the 'pyarrow' engine"

        with pytest.raises(err, match=msg):
            parser.read_csv(StringIO(data), converters={"ID": conv})


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    # These numbers fall right inside the int64-uint64
    # range, so they should be parsed as string.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([val])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
    # These numbers fall just outside the int64-uint64
    # range, so they should be parsed as string.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([str(val)])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # gets float64 dtype instead of object
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
    # No numerical dtype can hold both negative and uint64
    # values, so they should be cast as string.
    parser = all_parsers
    data = "\n".join(exp_data)
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), header=None)
    tm.assert_frame_equal(result, expected)


def test_integer_precision(all_parsers):
    # GH 7072
s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
|
||||
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(s), header=None)[4]
|
||||
expected = Series([4321583677327450765, 4321113141090630389], name=4)
|
||||
tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,134 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import pytest

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


def test_iterator(all_parsers):
    # see gh-6607
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    expected = parser.read_csv(StringIO(data), **kwargs)

    if parser.engine == "pyarrow":
        msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), iterator=True, **kwargs)
        return

    with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
        first_chunk = reader.read(3)
        tm.assert_frame_equal(first_chunk, expected[:3])

        last_chunk = reader.read(5)
    tm.assert_frame_equal(last_chunk, expected[3:])


def test_iterator2(all_parsers):
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""

    if parser.engine == "pyarrow":
        msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), iterator=True)
        return

    with parser.read_csv(StringIO(data), iterator=True) as reader:
        result = list(reader)

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result[0], expected)


def test_iterator_stop_on_chunksize(all_parsers):
    # gh-3967: stopping iteration when chunksize is specified
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), chunksize=1)
        return

    with parser.read_csv(StringIO(data), chunksize=1) as reader:
        result = list(reader)

    assert len(result) == 3
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(concat(result), expected)


@pytest.mark.parametrize(
    "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}]
)
def test_iterator_skipfooter_errors(all_parsers, kwargs):
    msg = "'skipfooter' not supported for iteration"
    parser = all_parsers
    data = "a\n1\n2"

    if parser.engine == "pyarrow":
        msg = (
            "The '(chunksize|iterator)' option is not supported with the "
            "'pyarrow' engine"
        )

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
            pass


def test_iteration_open_handle(all_parsers):
    parser = all_parsers
    kwargs = {"header": None}

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as f:
            f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")

        with open(path, encoding="utf-8") as f:
            for line in f:
                if "CCC" in line:
                    break

            result = parser.read_csv(f, **kwargs)
            expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,320 @@
"""
Tests that work on the Python, C and PyArrow engines but do not have a
specific classification into the other test modules.
"""
import codecs
import csv
from io import StringIO
import os
from pathlib import Path

import numpy as np
import pytest

from pandas.compat import PY311
from pandas.errors import (
    EmptyDataError,
    ParserError,
    ParserWarning,
)

from pandas import DataFrame
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_empty_decimal_marker(all_parsers):
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    # Parsers support only length-1 decimals
    msg = "Only length-1 decimal markers supported"
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = (
            "only single character unicode strings can be "
            "converted to Py_UCS4, got length 0"
        )

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), decimal="")


def test_bad_stream_exception(all_parsers, csv_dir_path):
    # see gh-13652
    #
    # This test validates that both the Python engine and C engine will
    # raise UnicodeDecodeError instead of C engine raising ParserError
    # and swallowing the exception that caused read to fail.
    path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
    codec = codecs.lookup("utf-8")
    utf8 = codecs.lookup("utf-8")
    parser = all_parsers
    msg = "'utf-8' codec can't decode byte"

    # Stream must be binary UTF8.
    with open(path, "rb") as handle, codecs.StreamRecoder(
        handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter
    ) as stream:
        with pytest.raises(UnicodeDecodeError, match=msg):
            parser.read_csv(stream)


def test_malformed(all_parsers):
    # see gh-6607
    parser = all_parsers
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
    msg = "Expected 3 fields in line 4, saw 5"
    err = ParserError
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        err = ValueError
    with pytest.raises(err, match=msg):
        parser.read_csv(StringIO(data), header=1, comment="#")


@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The 'iterator' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                header=1,
                comment="#",
                iterator=True,
                chunksize=1,
                skiprows=[2],
            )
        return

    msg = "Expected 3 fields in line 6, saw 5"
    with parser.read_csv(
        StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2]
    ) as reader:
        with pytest.raises(ParserError, match=msg):
            reader.read(nrows)


@xfail_pyarrow  # does not raise
def test_catch_too_many_names(all_parsers):
    # see gh-5156
    data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""
    parser = all_parsers
    msg = (
        "Too many columns specified: expected 4 and found 3"
        if parser.engine == "c"
        else "Number of passed names did not match "
        "number of header fields in the file"
    )

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
    parser = all_parsers
    data = "\n" * nrows

    msg = "No columns to parse from file"
    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data))


def test_unexpected_keyword_parameter_exception(all_parsers):
    # GH-34976
    parser = all_parsers

    msg = "{}\\(\\) got an unexpected keyword argument 'foo'"
    with pytest.raises(TypeError, match=msg.format("read_csv")):
        parser.read_csv("foo.csv", foo=1)
    with pytest.raises(TypeError, match=msg.format("read_table")):
        parser.read_table("foo.tsv", foo=1)


def test_suppress_error_output(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    expected = DataFrame({"a": [1, 4]})

    result = parser.read_csv(StringIO(data), on_bad_lines="skip")
    tm.assert_frame_equal(result, expected)


def test_error_bad_lines(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"

    msg = "Expected 1 fields in line 3, saw 3"

    if parser.engine == "pyarrow":
        # "CSV parse error: Expected 1 columns, got 3: 1,2,3"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), on_bad_lines="error")


def test_warn_bad_lines(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    expected = DataFrame({"a": [1, 4]})
    match_msg = "Skipping line"

    expected_warning = ParserWarning
    if parser.engine == "pyarrow":
        match_msg = "Expected 1 columns, but found 3: 1,2,3"
        expected_warning = (ParserWarning, DeprecationWarning)

    with tm.assert_produces_warning(
        expected_warning, match=match_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data), on_bad_lines="warn")
    tm.assert_frame_equal(result, expected)


def test_read_csv_wrong_num_columns(all_parsers):
    # Too few columns.
    data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
    parser = all_parsers
    msg = "Expected 6 fields in line 3, saw 7"

    if parser.engine == "pyarrow":
        # Expected 6 columns, got 7: 6,7,8,9,10,11,12
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data))


def test_null_byte_char(request, all_parsers):
    # see gh-2741
    data = "\x00,foo"
    names = ["a", "b"]
    parser = all_parsers

    if parser.engine == "c" or (parser.engine == "python" and PY311):
        if parser.engine == "python" and PY311:
            request.applymarker(
                pytest.mark.xfail(
                    reason="In Python 3.11, this is read as an empty character not null"
                )
            )
        expected = DataFrame([[np.nan, "foo"]], columns=names)
        out = parser.read_csv(StringIO(data), names=names)
        tm.assert_frame_equal(out, expected)
    else:
        if parser.engine == "pyarrow":
            # CSV parse error: Empty CSV file or block: "
            # cannot infer number of columns"
            pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
        else:
            msg = "NULL byte detected"
            with pytest.raises(ParserError, match=msg):
                parser.read_csv(StringIO(data), names=names)


@pytest.mark.filterwarnings("always::ResourceWarning")
def test_open_file(request, all_parsers):
    # GH 39024
    parser = all_parsers

    msg = "Could not determine delimiter"
    err = csv.Error
    if parser.engine == "c":
        msg = "the 'c' engine does not support sep=None with delim_whitespace=False"
        err = ValueError
    elif parser.engine == "pyarrow":
        msg = (
            "the 'pyarrow' engine does not support sep=None with delim_whitespace=False"
        )
        err = ValueError

    with tm.ensure_clean() as path:
        file = Path(path)
        file.write_bytes(b"\xe4\na\n1")

        with tm.assert_produces_warning(None):
            # should not trigger a ResourceWarning
            with pytest.raises(err, match=msg):
                parser.read_csv(file, sep=None, encoding_errors="replace")


def test_invalid_on_bad_line(all_parsers):
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"):
        parser.read_csv(StringIO(data), on_bad_lines="abc")


def test_bad_header_uniform_error(all_parsers):
    parser = all_parsers
    data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
    msg = "Expected 2 fields in line 2, saw 4"
    if parser.engine == "c":
        msg = (
            "Could not construct index. Requested to use 1 "
            "number of columns, but 3 left to parse."
        )
    elif parser.engine == "pyarrow":
        # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")


def test_on_bad_lines_warn_correct_formatting(all_parsers):
    # see gh-15925
    parser = all_parsers
    data = """1,2
a,b
a,b,c
a,b,d
a,b
"""
    expected = DataFrame({"1": "a", "2": ["b"] * 2})
    match_msg = "Skipping line"

    expected_warning = ParserWarning
    if parser.engine == "pyarrow":
        match_msg = "Expected 2 columns, but found 3: a,b,c"
        expected_warning = (ParserWarning, DeprecationWarning)

    with tm.assert_produces_warning(
        expected_warning, match=match_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data), on_bad_lines="warn")
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,81 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import pytest

import pandas._testing as tm

depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"


def test_verbose_read(all_parsers, capsys):
    parser = all_parsers
    data = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""

    if parser.engine == "pyarrow":
        msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), verbose=True)
        return

    # Engines are verbose in different ways.
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        parser.read_csv(StringIO(data), verbose=True)
    captured = capsys.readouterr()

    if parser.engine == "c":
        assert "Tokenization took:" in captured.out
        assert "Parser memory cleanup took:" in captured.out
    else:  # Python engine
        assert captured.out == "Filled 3 NA values in column a\n"


def test_verbose_read2(all_parsers, capsys):
    parser = all_parsers
    data = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

    if parser.engine == "pyarrow":
        msg = "The 'verbose' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), verbose=True, index_col=0)
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        parser.read_csv(StringIO(data), verbose=True, index_col=0)
    captured = capsys.readouterr()

    # Engines are verbose in different ways.
    if parser.engine == "c":
        assert "Tokenization took:" in captured.out
        assert "Parser memory cleanup took:" in captured.out
    else:  # Python engine
        assert captured.out == "Filled 1 NA values in column a\n"
@@ -0,0 +1,337 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat import HAS_PYARROW
|
||||
from pandas.compat._optional import VERSIONS
|
||||
|
||||
from pandas import (
|
||||
read_csv,
|
||||
read_table,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class BaseParser:
|
||||
engine: str | None = None
|
||||
low_memory = True
|
||||
float_precision_choices: list[str | None] = []
|
||||
|
||||
def update_kwargs(self, kwargs):
|
||||
kwargs = kwargs.copy()
|
||||
kwargs.update({"engine": self.engine, "low_memory": self.low_memory})
|
||||
|
||||
return kwargs
|
||||
|
||||
def read_csv(self, *args, **kwargs):
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
return read_csv(*args, **kwargs)
|
||||
|
||||
def read_csv_check_warnings(
|
||||
self,
|
||||
warn_type: type[Warning],
|
||||
warn_msg: str,
|
||||
*args,
|
||||
raise_on_extra_warnings=True,
|
||||
check_stacklevel: bool = True,
|
||||
**kwargs,
|
||||
):
|
||||
# We need to check the stacklevel here instead of in the tests
|
||||
# since this is where read_csv is called and where the warning
|
||||
# should point to.
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
with tm.assert_produces_warning(
|
||||
warn_type,
|
||||
match=warn_msg,
|
||||
raise_on_extra_warnings=raise_on_extra_warnings,
|
||||
check_stacklevel=check_stacklevel,
|
||||
):
|
||||
return read_csv(*args, **kwargs)
|
||||
|
||||
def read_table(self, *args, **kwargs):
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
return read_table(*args, **kwargs)
|
||||
|
||||
def read_table_check_warnings(
|
||||
self,
|
||||
warn_type: type[Warning],
|
||||
warn_msg: str,
|
||||
*args,
|
||||
raise_on_extra_warnings=True,
|
||||
**kwargs,
|
||||
):
|
||||
# We need to check the stacklevel here instead of in the tests
|
||||
# since this is where read_table is called and where the warning
|
||||
# should point to.
|
||||
kwargs = self.update_kwargs(kwargs)
|
||||
with tm.assert_produces_warning(
|
||||
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
|
||||
):
|
||||
return read_table(*args, **kwargs)
|
||||
|
||||
|
||||
class CParser(BaseParser):
|
||||
engine = "c"
|
||||
float_precision_choices = [None, "high", "round_trip"]
|
||||
|
||||
|
||||
class CParserHighMemory(CParser):
|
||||
low_memory = False
|
||||
|
||||
|
||||
class CParserLowMemory(CParser):
|
||||
low_memory = True
|
||||
|
||||
|
||||
class PythonParser(BaseParser):
|
||||
engine = "python"
|
||||
float_precision_choices = [None]
|
||||
|
||||
|
||||
class PyArrowParser(BaseParser):
|
||||
engine = "pyarrow"
|
||||
float_precision_choices = [None]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def csv_dir_path(datapath):
|
||||
"""
|
||||
The directory path to the data files needed for parser tests.
|
||||
"""
|
||||
return datapath("io", "parser", "data")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def csv1(datapath):
|
||||
"""
|
||||
The path to the data file "test1.csv" needed for parser tests.
|
||||
"""
|
||||
return os.path.join(datapath("io", "data", "csv"), "test1.csv")
|
||||
|
||||
|
||||
_cParserHighMemory = CParserHighMemory
|
||||
_cParserLowMemory = CParserLowMemory
|
||||
_pythonParser = PythonParser
|
||||
_pyarrowParser = PyArrowParser
|
||||
|
||||
_py_parsers_only = [_pythonParser]
|
||||
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
|
||||
_pyarrow_parsers_only = [
|
||||
pytest.param(
|
||||
_pyarrowParser,
|
||||
marks=[
|
||||
pytest.mark.single_cpu,
|
||||
pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"),
|
||||
],
|
||||
)
|
||||
]
|
||||
|
||||
_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]
|
||||
|
||||
_py_parser_ids = ["python"]
|
||||
_c_parser_ids = ["c_high", "c_low"]
|
||||
_pyarrow_parsers_ids = ["pyarrow"]
|
||||
|
||||
_all_parser_ids = [*_c_parser_ids, *_py_parser_ids, *_pyarrow_parsers_ids]
|
||||
|
||||
|
||||
@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
|
||||
def all_parsers(request):
|
||||
"""
|
||||
Fixture all of the CSV parsers.
|
||||
"""
|
||||
parser = request.param()
|
||||
if parser.engine == "pyarrow":
|
||||
pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
|
||||
# Try finding a way to disable threads all together
|
||||
# for more stable CI runs
|
||||
import pyarrow
|
||||
|
||||
pyarrow.set_cpu_count(1)
|
||||
return parser
|
||||
|
||||
|
||||
@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
|
||||
def c_parser_only(request):
|
||||
"""
|
||||
Fixture all of the CSV parsers using the C engine.
|
||||
"""
|
||||
return request.param()
|
||||
|
||||
|
||||
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
|
||||
def python_parser_only(request):
|
||||
"""
|
||||
Fixture all of the CSV parsers using the Python engine.
|
||||
"""
|
||||
return request.param()
|
||||
|
||||
|
||||
@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
|
||||
def pyarrow_parser_only(request):
|
||||
"""
|
||||
Fixture all of the CSV parsers using the Pyarrow engine.
|
||||
"""
|
||||
return request.param()
|
||||
|
||||
|
||||
def _get_all_parser_float_precision_combinations():
|
||||
"""
|
||||
Return all allowable parser and float precision
|
||||
combinations and corresponding ids.
|
||||
"""
|
||||
params = []
|
||||
ids = []
|
||||
for parser, parser_id in zip(_all_parsers, _all_parser_ids):
|
||||
if hasattr(parser, "values"):
|
||||
# Wrapped in pytest.param, get the actual parser back
|
||||
parser = parser.values[0]
|
||||
for precision in parser.float_precision_choices:
|
||||
# Re-wrap in pytest.param for pyarrow
|
||||
mark = (
|
||||
[
|
||||
pytest.mark.single_cpu,
|
||||
pytest.mark.skipif(
|
||||
not HAS_PYARROW, reason="pyarrow is not installed"
|
||||
),
|
||||
]
|
||||
if parser.engine == "pyarrow"
|
||||
else ()
|
||||
)
|
||||
param = pytest.param((parser(), precision), marks=mark)
|
||||
params.append(param)
|
||||
ids.append(f"{parser_id}-{precision}")
|
||||
|
||||
return {"params": params, "ids": ids}
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=_get_all_parser_float_precision_combinations()["params"],
|
||||
ids=_get_all_parser_float_precision_combinations()["ids"],
|
||||
)
|
||||
def all_parsers_all_precisions(request):
|
||||
"""
|
||||
Fixture for all allowable combinations of parser
|
||||
and float precision
|
||||
"""
|
||||
return request.param
|
||||
|
||||
|
||||
_utf_values = [8, 16, 32]
|
||||
|
||||
_encoding_seps = ["", "-", "_"]
|
||||
_encoding_prefixes = ["utf", "UTF"]
|
||||
|
||||
_encoding_fmts = [
|
||||
f"{prefix}{sep}{{0}}" for sep in _encoding_seps for prefix in _encoding_prefixes
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(params=_utf_values)
|
||||
def utf_value(request):
|
||||
"""
|
||||
Fixture for all possible integer values for a UTF encoding.
|
||||
"""
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=_encoding_fmts)
|
||||
def encoding_fmt(request):
|
||||
"""
|
||||
Fixture for all possible string formats of a UTF encoding.
|
||||
"""
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
("-1,0", -1.0),
|
||||
("-1,2e0", -1.2),
|
||||
("-1e0", -1.0),
|
||||
("+1e0", 1.0),
|
||||
("+1e+0", 1.0),
|
||||
("+1e-1", 0.1),
|
||||
("+,1e1", 1.0),
|
||||
("+1,e0", 1.0),
|
||||
("-,1e1", -1.0),
|
||||
("-1,e0", -1.0),
|
||||
("0,1", 0.1),
|
||||
("1,", 1.0),
|
||||
(",1", 0.1),
|
||||
("-,1", -0.1),
|
||||
("1_,", 1.0),
|
||||
("1_234,56", 1234.56),
|
||||
("1_234,56e0", 1234.56),
|
||||
# negative cases; must not parse as float
|
||||
("_", "_"),
|
||||
("-_", "-_"),
|
||||
("-_1", "-_1"),
|
||||
("-_1e0", "-_1e0"),
|
||||
("_1", "_1"),
|
||||
("_1,", "_1,"),
|
||||
("_1,_", "_1,_"),
|
||||
("_1e0", "_1e0"),
|
||||
("1,2e_1", "1,2e_1"),
|
||||
("1,2e1_0", "1,2e1_0"),
|
||||
("1,_2", "1,_2"),
|
||||
(",1__2", ",1__2"),
|
||||
(",1e", ",1e"),
|
||||
("-,1e", "-,1e"),
|
||||
("1_000,000_000", "1_000,000_000"),
|
||||
("1,e1_2", "1,e1_2"),
|
||||
("e11,2", "e11,2"),
|
||||
("1e11,2", "1e11,2"),
|
||||
("1,2,2", "1,2,2"),
|
||||
("1,2_1", "1,2_1"),
|
||||
("1,2e-10e1", "1,2e-10e1"),
|
||||
("--1,2", "--1,2"),
|
||||
("1a_2,1", "1a_2,1"),
|
||||
("1,2E-1", 0.12),
|
||||
("1,2E1", 12.0),
|
||||
]
|
||||
)
|
||||
def numeric_decimal(request):
|
||||
"""
|
||||
Fixture for all numeric formats which should get recognized. The first entry
|
||||
represents the value to read while the second represents the expected result.
|
||||
"""
|
||||
return request.param
|
||||


@pytest.fixture
def pyarrow_xfail(request):
    """
    Fixture that xfails a test if the engine is pyarrow.

    Use if failure is due to unsupported keywords or inconsistent results.
    """
    if "all_parsers" in request.fixturenames:
        parser = request.getfixturevalue("all_parsers")
    elif "all_parsers_all_precisions" in request.fixturenames:
        # Return value is tuple of (engine, precision)
        parser = request.getfixturevalue("all_parsers_all_precisions")[0]
    else:
        return
    if parser.engine == "pyarrow":
        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
        request.applymarker(mark)


@pytest.fixture
def pyarrow_skip(request):
    """
    Fixture that skips a test if the engine is pyarrow.

    Use if failure is due to a parsing failure from pyarrow.csv.read_csv.
    """
    if "all_parsers" in request.fixturenames:
        parser = request.getfixturevalue("all_parsers")
    elif "all_parsers_all_precisions" in request.fixturenames:
        # Return value is tuple of (engine, precision)
        parser = request.getfixturevalue("all_parsers_all_precisions")[0]
    else:
        return
    if parser.engine == "pyarrow":
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
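

# Usage note, not part of the original commit: tests opt in to these fixtures
# by name, typically through a module-level alias such as
#
#     xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
#
# (the dtype test modules below define exactly this alias). The fixture runs
# at setup time, inspects whichever parser fixture the test actually uses,
# and attaches the xfail or skip outcome only when the engine is pyarrow.
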
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,334 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os

import numpy as np
import pytest

from pandas._libs import parsers as libparsers

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Timestamp,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["a", "a", "b"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype, request):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
    )
    if parser.engine == "pyarrow":
        mark = pytest.mark.xfail(
            strict=False,
            reason="Flaky test sometimes gives object dtype instead of Categorical",
        )
        request.applymarker(mark)

    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_unsorted(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["b", "b", "a"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
def test_categorical_dtype_missing(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["b", np.nan, "a"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
    # see gh-18186
    # was an issue with C parser, due to DEFAULT_BUFFER_HEURISTIC
    parser = all_parsers
    heuristic = 2**5
    data = np.sort([str(i) for i in range(heuristic + 1)])
    expected = DataFrame({"a": Categorical(data, ordered=True)})
    with monkeypatch.context() as m:
        m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
        actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
    actual["a"] = actual["a"].cat.reorder_categories(
        np.sort(actual.a.cat.categories), ordered=True
    )
    tm.assert_frame_equal(actual, expected)
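

# Illustrative sketch, not part of the original commit (the _demo_* name is
# hypothetical): when the C reader tokenizes a column in several internal
# chunks, each chunk infers its own categories and the partial Categoricals
# are merged afterwards. The public analogue of that merge is
# pandas.api.types.union_categoricals, shown here on hypothetical per-chunk
# results:
def _demo_union_of_chunked_categoricals():
    import pandas as pd
    from pandas.api.types import union_categoricals

    chunk1 = pd.Categorical(["0", "1"])
    chunk2 = pd.Categorical(["10", "11"])  # categories differ per chunk
    merged = union_categoricals([chunk1, chunk2])
    # Categories come back as the union in order of appearance, which is why
    # the test above re-sorts categories before comparing.
    assert list(merged.categories) == ["0", "1", "10", "11"]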


def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers
    encoding = "utf-16"
    sep = "\t"

    expected = parser.read_csv(pth, sep=sep, encoding=encoding)
    expected = expected.apply(Categorical)

    actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
        DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
    ]

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)
        return

    with parser.read_csv(
        StringIO(data), dtype={"b": "category"}, chunksize=2
    ) as actuals:
        for actual, expected in zip(actuals, expecteds):
            tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
        DataFrame(
            {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
            index=[2, 3],
        ),
    ]
    dtype = CategoricalDtype(cats)

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
        return

    with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
        for actual, expected in zip(actuals, expecteds):
            tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers
    encoding = "latin-1"

    expected = parser.read_csv(pth, header=None, encoding=encoding)
    expected[1] = Categorical(expected[1])

    actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
    tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expected = DataFrame(
        {
            "a": [1, 1, 1, 2],
            "b": Categorical(
                ["a", "b", "b", "c"], categories=categories, ordered=ordered
            ),
        }
    )

    dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_category_dtype_unsorted(all_parsers):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    dtype = CategoricalDtype(["c", "b", "a"])
    expected = DataFrame(
        {
            "a": [1, 1, 1, 2],
            "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
        }
    )

    result = parser.read_csv(StringIO(data), dtype={"b": dtype})
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_numeric(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([1, 2, 3])}

    data = "b\n1\n1\n2\n3"
    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_datetime(all_parsers):
    parser = all_parsers
    dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
    dtype = {"b": CategoricalDtype(dti)}

    data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timestamp(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([Timestamp("2014")])}

    data = "b\n2014-01-01\n2014-01-01"
    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timedelta(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))}

    data = "b\n1h\n2h\n3h"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    # see gh-20498
    parser = all_parsers
    dtype = {"b": CategoricalDtype([False, True])}
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_unexpected_categories(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

    data = "b\nd\na\nc\nd"  # Unexpected c
    expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
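

# Illustrative sketch, not part of the original commit (the _demo_* name is
# hypothetical): values absent from an explicit CategoricalDtype do not raise
# during parsing; they simply become missing, which is the behavior the
# "Unexpected c" case above relies on.
def _demo_unexpected_category_becomes_nan():
    import pandas as pd

    dtype = pd.CategoricalDtype(["a", "b", "d", "e"])
    cat = pd.Categorical(["d", "a", "c", "d"], dtype=dtype)
    assert pd.isna(cat[2])  # "c" is not a declared category
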
@@ -0,0 +1,644 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from collections import defaultdict
from io import StringIO

import numpy as np
import pytest

from pandas.errors import ParserWarning

import pandas as pd
from pandas import (
    DataFrame,
    Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import IntegerArray

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
    # see gh-3795, gh-6607
    parser = all_parsers

    df = DataFrame(
        np.random.default_rng(2).random((5, 2)).round(4),
        columns=list("AB"),
        index=["1A", "1B", "1C", "1D", "1E"],
    )

    with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, dtype=dtype, index_col=0)

        if check_orig:
            expected = df.copy()
            result = result.astype(float)
        elif using_infer_string and dtype is str:
            expected = df.astype(str)
        else:
            expected = df.astype(str).astype(object)

        tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
    expected = DataFrame(
        [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
    )
    expected["one"] = expected["one"].astype(np.float64)

    result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
    tm.assert_frame_equal(result, expected)


def test_invalid_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"):
        parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})


def test_raise_on_passed_int_dtype_with_nas(all_parsers):
    # see gh-2631
    parser = all_parsers
    data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""

    if parser.engine == "c":
        msg = "Integer column has NA values"
    elif parser.engine == "pyarrow":
        msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
    else:
        msg = "Unable to convert column DOY"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)


def test_dtype_with_converters(all_parsers):
    parser = all_parsers
    data = """a,b
1.1,2.2
1.2,2.3"""

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)}
            )
        return

    # Dtype spec is ignored if a converter is specified.
    result = parser.read_csv_check_warnings(
        ParserWarning,
        "Both a converter and dtype were specified for column a "
        "- only the converter will be used.",
        StringIO(data),
        dtype={"a": "i8"},
        converters={"a": lambda x: str(x)},
    )
    expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
    tm.assert_frame_equal(result, expected)
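

# Illustrative sketch, not part of the original commit (the _demo_* name is
# hypothetical): when both a converter and a dtype target the same column,
# pandas warns and the converter wins, which is the contract the test above
# pins down.
def _demo_converter_beats_dtype():
    import warnings
    from io import StringIO

    import pandas as pd

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df = pd.read_csv(StringIO("a\n1.1"), dtype={"a": "i8"}, converters={"a": str})
    assert df["a"].tolist() == ["1.1"]  # converter output, not int64
    assert any("only the converter will be used" in str(w.message) for w in caught)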


@pytest.mark.parametrize(
    "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
    data = "0\n1"
    parser = all_parsers
    expected = DataFrame([0, 1], dtype=dtype)

    result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
    tm.assert_frame_equal(expected, result)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_boolean_dtype(all_parsers):
    parser = all_parsers
    data = "\n".join(
        [
            "a",
            "True",
            "TRUE",
            "true",
            "1",
            "1.0",
            "False",
            "FALSE",
            "false",
            "0",
            "0.0",
            "NaN",
            "nan",
            "NA",
            "null",
            "NULL",
        ]
    )

    result = parser.read_csv(StringIO(data), dtype="boolean")
    expected = DataFrame(
        {
            "a": pd.array(
                [
                    True,
                    True,
                    True,
                    True,
                    True,
                    False,
                    False,
                    False,
                    False,
                    False,
                    None,
                    None,
                    None,
                    None,
                    None,
                ],
                dtype="boolean",
            )
        }
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_delimiter_with_usecols_and_parse_dates(all_parsers):
    # GH#35873
    result = all_parsers.read_csv(
        StringIO('"dump","-9,1","-9,1",20101010'),
        engine="python",
        names=["col", "col1", "col2", "col3"],
        usecols=["col1", "col2", "col3"],
        parse_dates=["col3"],
        decimal=",",
    )
    expected = DataFrame(
        {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("thousands", ["_", None])
def test_decimal_and_exponential(
    request, python_parser_only, numeric_decimal, thousands
):
    # GH#31920
    decimal_number_check(request, python_parser_only, numeric_decimal, thousands, None)


@pytest.mark.parametrize("thousands", ["_", None])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_1000_sep_decimal_float_precision(
    request, c_parser_only, numeric_decimal, float_precision, thousands
):
    # test decimal and thousands separator handling across the
    # 'float_precision' options
    decimal_number_check(
        request, c_parser_only, numeric_decimal, thousands, float_precision
    )
    text, value = numeric_decimal
    text = " " + text + " "
    if isinstance(value, str):  # the negative cases (parse as text)
        value = " " + value + " "
    decimal_number_check(
        request, c_parser_only, (text, value), thousands, float_precision
    )


def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision):
    # GH#31920
    value = numeric_decimal[0]
    if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"):
        request.applymarker(
            pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}")
        )
    df = parser.read_csv(
        StringIO(value),
        float_precision=float_precision,
        sep="|",
        thousands=thousands,
        decimal=",",
        header=None,
    )
    val = df.iloc[0, 0]
    assert val == numeric_decimal[1]


@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_skip_whitespace(c_parser_only, float_precision):
    DATA = """id\tnum\t
1\t1.2 \t
1\t 2.1\t
2\t 1\t
2\t 1.2 \t
"""
    df = c_parser_only.read_csv(
        StringIO(DATA),
        float_precision=float_precision,
        sep="\t",
        header=0,
        dtype={1: np.float64},
    )
    tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))


@pytest.mark.usefixtures("pyarrow_xfail")
def test_true_values_cast_to_bool(all_parsers):
    # GH#34655
    text = """a,b
yes,xxx
no,yyy
1,zzz
0,aaa
"""
    parser = all_parsers
    result = parser.read_csv(
        StringIO(text),
        true_values=["yes"],
        false_values=["no"],
        dtype={"a": "boolean"},
    )
    expected = DataFrame(
        {"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
    )
    expected["a"] = expected["a"].astype("boolean")
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
    # GH#35211
    parser = all_parsers
    data = """a,a\n1,1"""
    dtype_dict = {"a": str, **dtypes}
    # GH#42462
    dtype_dict_copy = dtype_dict.copy()
    result = parser.read_csv(StringIO(data), dtype=dtype_dict)
    expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
    assert dtype_dict == dtype_dict_copy, "dtype dict changed"
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
    # GH#42022
    parser = all_parsers
    data = """a,a\n1,1"""
    result = parser.read_csv(StringIO(data), dtype=str)
    expected = DataFrame({"a": ["1"], "a.1": ["1"]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_multi_index(all_parsers):
    # GH 42446
    parser = all_parsers
    data = "A,B,B\nX,Y,Z\n1,2,3"

    result = parser.read_csv(
        StringIO(data),
        header=list(range(2)),
        dtype={
            ("A", "X"): np.int32,
            ("B", "Y"): np.int32,
            ("B", "Z"): np.float32,
        },
    )

    expected = DataFrame(
        {
            ("A", "X"): np.int32([1]),
            ("B", "Y"): np.int32([2]),
            ("B", "Z"): np.float32([3]),
        }
    )

    tm.assert_frame_equal(result, expected)


def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
    # GH 25472
    parser = all_parsers
    dtype = any_int_ea_dtype

    data = """a,b,c
,3,5
1,,6
2,4,"""
    expected = DataFrame(
        {
            "a": pd.array([pd.NA, 1, 2], dtype=dtype),
            "b": pd.array([3, pd.NA, 4], dtype=dtype),
            "c": pd.array([5, 6, pd.NA], dtype=dtype),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)
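

# Illustrative sketch, not part of the original commit (the _demo_* name is
# hypothetical): the nullable integer dtypes keep missing values without
# falling back to float64, which is what the expected frames above encode
# via pd.array.
def _demo_nullable_int_keeps_na():
    import pandas as pd

    arr = pd.array([None, 1, 2], dtype="Int64")
    assert str(arr.dtype) == "Int64"
    assert arr[0] is pd.NA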


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("default", ["float", "float64"])
def test_dtypes_defaultdict(all_parsers, default):
    # GH#41574
    data = """a,b
1,2
"""
    dtype = defaultdict(lambda: default, a="int64")
    parser = all_parsers
    result = parser.read_csv(StringIO(data), dtype=dtype)
    expected = DataFrame({"a": [1], "b": 2.0})
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
    # GH#41574
    data = """a,b,a,b,b.1
1,2,3,4,5
"""
    dtype = defaultdict(lambda: "float64", a="int64")
    dtype["b.1"] = "int64"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), dtype=dtype)
    expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_invalid(all_parsers):
    # GH#41574
    data = """a,b
1,2
"""
    dtype = defaultdict(lambda: "invalid_dtype", a="int64")
    parser = all_parsers
    with pytest.raises(TypeError, match="not understood"):
        parser.read_csv(StringIO(data), dtype=dtype)
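

# Illustrative sketch, not part of the original commit (the _demo_* name is
# hypothetical): a defaultdict dtype lets columns that are not named
# explicitly fall back to default_factory, so "a" stays "int64" below while
# any other column resolves to "float64".
def _demo_defaultdict_dtype_lookup():
    from collections import defaultdict

    dtype = defaultdict(lambda: "float64", a="int64")
    assert dtype["a"] == "int64"
    assert dtype["b"] == "float64"  # entry created on first access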


def test_dtype_backend(all_parsers):
    # GH#36712

    parser = all_parsers

    data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
    result = parser.read_csv(
        StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"]
    )
    expected = DataFrame(
        {
            "a": pd.Series([1, 3], dtype="Int64"),
            "b": pd.Series([2.5, 4.5], dtype="Float64"),
            "c": pd.Series([True, False], dtype="boolean"),
            "d": pd.Series(["a", "b"], dtype="string"),
            "e": pd.Series([pd.NA, 6], dtype="Int64"),
            "f": pd.Series([pd.NA, 7.5], dtype="Float64"),
            "g": pd.Series([pd.NA, True], dtype="boolean"),
            "h": pd.Series([pd.NA, "a"], dtype="string"),
            "i": pd.Series([Timestamp("2019-12-31")] * 2),
            "j": pd.Series([pd.NA, pd.NA], dtype="Int64"),
        }
    )
    tm.assert_frame_equal(result, expected)


def test_dtype_backend_and_dtype(all_parsers):
    # GH#36712

    parser = all_parsers

    data = """a,b
1,2.5
,
"""
    result = parser.read_csv(
        StringIO(data), dtype_backend="numpy_nullable", dtype="float64"
    )
    expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]})
    tm.assert_frame_equal(result, expected)


def test_dtype_backend_string(all_parsers, string_storage):
    # GH#36712
    with pd.option_context("mode.string_storage", string_storage):
        parser = all_parsers

        data = """a,b
a,x
b,
"""
        result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")

        expected = DataFrame(
            {
                "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
                "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
            },
        )
        tm.assert_frame_equal(result, expected)


def test_dtype_backend_ea_dtype_specified(all_parsers):
    # GH#491496
    data = """a,b
1,2
"""
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), dtype="Int64", dtype_backend="numpy_nullable"
    )
    expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
    tm.assert_frame_equal(result, expected)


def test_dtype_backend_pyarrow(all_parsers, request):
    # GH#36712
    pa = pytest.importorskip("pyarrow")
    parser = all_parsers

    data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
    result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"])
    expected = DataFrame(
        {
            "a": pd.Series([1, 3], dtype="int64[pyarrow]"),
            "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
            "c": pd.Series([True, False], dtype="bool[pyarrow]"),
            "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
            "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
            "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
            "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
            "h": pd.Series(
                [pd.NA, "a"],
                dtype=pd.ArrowDtype(pa.string()),
            ),
            "i": pd.Series([Timestamp("2019-12-31")] * 2),
            "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
        }
    )
    tm.assert_frame_equal(result, expected)
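

# Illustrative sketch, not part of the original commit (the _demo_* name is
# hypothetical): dtype_backend selects which extension arrays back the parsed
# columns, so the same CSV can come back NumPy-nullable or Arrow-backed.
def _demo_dtype_backend_choices():
    from io import StringIO

    import pandas as pd

    data = "a,b\n1,x\n,y"
    nullable = pd.read_csv(StringIO(data), dtype_backend="numpy_nullable")
    assert str(nullable["a"].dtype) == "Int64"
    try:
        import pyarrow  # noqa: F401  # the Arrow backend needs pyarrow installed
    except ImportError:
        return
    arrow = pd.read_csv(StringIO(data), dtype_backend="pyarrow")
    assert str(arrow["a"].dtype) == "int64[pyarrow]"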


# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
    # GH#32134
    parser = all_parsers
    data = """a,b
1,1
,1
1582218195625938945,1
"""
    result = parser.read_csv(StringIO(data), dtype={"a": "Int64"})
    expected = DataFrame(
        {
            "a": IntegerArray(
                np.array([1, 1, 1582218195625938945]), np.array([False, True, False])
            ),
            "b": 1,
        }
    )
    tm.assert_frame_equal(result, expected)


def test_string_inference(all_parsers):
    # GH#54430
    dtype = pd.StringDtype(na_value=np.nan)

    data = """a,b
x,1
y,2
,3"""
    parser = all_parsers
    with pd.option_context("future.infer_string", True):
        result = parser.read_csv(StringIO(data))

    expected = DataFrame(
        {"a": pd.Series(["x", "y", None], dtype=dtype), "b": [1, 2, 3]},
        columns=pd.Index(["a", "b"], dtype=dtype),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
    # GH#56047
    data = """a,b
x,a
y,a
z,a"""
    parser = all_parsers
    with pd.option_context("future.infer_string", True):
        result = parser.read_csv(StringIO(data), dtype=dtype)

    expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
    expected = DataFrame(
        {
            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
            "b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
        },
        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
    )
    tm.assert_frame_equal(result, expected)

    with pd.option_context("future.infer_string", True):
        result = parser.read_csv(StringIO(data), dtype={"a": dtype})

    expected = DataFrame(
        {
            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
            "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)),
        },
        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
    # GH#52505
    data = """SYMBOL,MOMENT,ID,ID_DEAL
AAPL,20230301181139587,1925036343869802844,
AAPL,20230301181139587,2023552585717889863,2023552585717263358
NVDA,20230301181139587,2023552585717889863,2023552585717263359
AMC,20230301181139587,2023552585717889863,2023552585717263360
AMZN,20230301181139587,2023552585717889759,2023552585717263360
MSFT,20230301181139587,2023552585717889863,2023552585717263361
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
    orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2


def test_dtypes_with_usecols(all_parsers):
    # GH#54868

    parser = all_parsers
    data = """a,b,c
1,2,3
4,5,6"""

    result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object})
    if parser.engine == "pyarrow":
        values = [1, 4]
    else:
        values = ["1", "4"]
    expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
    tm.assert_frame_equal(result, expected)


def test_index_col_with_dtype_no_rangeindex(all_parsers):
    data = StringIO("345.5,519.5,0\n519.5,726.5,1")
    result = all_parsers.read_csv(
        data,
        header=None,
        names=["start", "stop", "bin_id"],
        dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
        index_col="bin_id",
    ).index
    expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
    tm.assert_index_equal(result, expected)
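

# Illustrative note, not part of the original commit (the _demo_* name is
# hypothetical): parsing these 19-digit ids as float64 would collapse
# neighbors onto the same value, which is why the large-integer test above
# forces a nullable Int64 dtype.
def _demo_float64_rounds_large_ints():
    import numpy as np

    a, b = 2023552585717263360, 2023552585717263361
    assert a != b
    assert np.float64(a) == np.float64(b)  # indistinguishable as float64
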
@@ -0,0 +1,181 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    Categorical,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    concat,
)
import pandas._testing as tm

skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_dtype_all_columns_empty(all_parsers):
    # see gh-12048
    parser = all_parsers
    result = parser.read_csv(StringIO("A,B"), dtype=str)

    expected = DataFrame({"A": [], "B": []}, dtype=str)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=object)},
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(
        StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"}
    )

    expected = DataFrame(
        {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one")
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_multi_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two,three"
    result = parser.read_csv(
        StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"}
    )

    exp_idx = MultiIndex.from_arrays(
        [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)],
        names=["one", "two"],
    )
    expected = DataFrame({"three": np.empty(0, dtype=object)}, index=exp_idx)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat(
        [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
        axis=1,
    )

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
    tm.assert_frame_equal(result, expected)


def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat(
        [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
        axis=1,
    )
    expected.index = expected.index.astype(object)

    with pytest.raises(ValueError, match="Duplicate names"):
        data = ""
        parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})


@pytest.mark.parametrize(
    "dtype,expected",
    [
        (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
        (
            "category",
            DataFrame({"a": Categorical([]), "b": Categorical([])}),
        ),
        (
            {"a": "category", "b": "category"},
            DataFrame({"a": Categorical([]), "b": Categorical([])}),
        ),
        ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
        (
            "timedelta64[ns]",
            DataFrame(
                {
                    "a": Series([], dtype="timedelta64[ns]"),
                    "b": Series([], dtype="timedelta64[ns]"),
                },
            ),
        ),
        (
            {"a": np.int64, "b": np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
        (
            {0: np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
        (
            {"a": np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
    ],
)
@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_dtype(all_parsers, dtype, expected):
    # see gh-14712
    parser = all_parsers
    data = "a,b"

    result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
    tm.assert_frame_equal(result, expected)
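

# Illustrative note, not part of the original commit (the _demo_* name is
# hypothetical): the terse dtype strings used throughout this module are
# NumPy shorthand; "u1" is uint8 and "f" is float32.
def _demo_numpy_dtype_shorthand():
    import numpy as np

    assert np.dtype("u1") == np.uint8
    assert np.dtype("f") == np.float32
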
@@ -0,0 +1,647 @@
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the Python parser can
accept further arguments when parsing.
"""
from decimal import Decimal
from io import (
    BytesIO,
    StringIO,
    TextIOWrapper,
)
import mmap
import os
import tarfile

import numpy as np
import pytest

from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
    ParserError,
    ParserWarning,
)
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm


@pytest.mark.parametrize(
    "malformed",
    ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
    ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
    # see gh-9205: test certain malformed input files that cause
    # buffer overflows in tokenizer.c
    msg = "Buffer overflow caught - possible malformed input file."
    parser = c_parser_only

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(malformed))


def test_delim_whitespace_custom_terminator(c_parser_only):
    # See gh-12912
    data = "a b c~1 2 3~4 5 6~7 8 9"
    parser = c_parser_only

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
    tm.assert_frame_equal(df, expected)


def test_dtype_and_names_error(c_parser_only):
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    parser = c_parser_only
    data = """
1.0 1
2.0 2
3.0 3
"""
    # base cases
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # fallback casting
    result = parser.read_csv(
        StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
    )
    expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    warning = RuntimeWarning if np_version_gte1p24 else None
    with pytest.raises(ValueError, match="cannot safely convert"):
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            parser.read_csv(
                StringIO(data),
                sep=r"\s+",
                header=None,
                names=["a", "b"],
                dtype={"a": np.int32},
            )


@pytest.mark.parametrize(
    "match,kwargs",
    [
        # For each of these cases, all of the dtypes are valid, just unsupported.
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}},
        ),
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]},
        ),
        (
            "the dtype timedelta64 is not supported for parsing",
            {"dtype": {"A": "timedelta64", "B": "float64"}},
        ),
        (
            f"the dtype {tm.ENDIAN}U8 is not supported for parsing",
            {"dtype": {"A": "U8"}},
        ),
    ],
    ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
    parser = c_parser_only
    df = DataFrame(
        np.random.default_rng(2).random((5, 2)),
        columns=list("AB"),
        index=["1A", "1B", "1C", "1D", "1E"],
    )

    with tm.ensure_clean("__unsupported_dtype__.csv") as path:
        df.to_csv(path)

        with pytest.raises(TypeError, match=match):
            parser.read_csv(path, index_col=0, **kwargs)


@td.skip_if_32bit
@pytest.mark.slow
# test numbers between 1 and 2
@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
def test_precise_conversion(c_parser_only, num):
    parser = c_parser_only

    normal_errors = []
    precise_errors = []

    def error(val: float, actual_val: Decimal) -> Decimal:
        return abs(Decimal(f"{val:.100}") - actual_val)

    # 25 decimal digits of precision
    text = f"a\n{num:.25}"

    normal_val = float(
        parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
    )
    precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
    roundtrip_val = float(
        parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
    )
    actual_val = Decimal(text[2:])

    normal_errors.append(error(normal_val, actual_val))
    precise_errors.append(error(precise_val, actual_val))

    # round-trip should match float()
    assert roundtrip_val == float(text[2:])

    assert sum(precise_errors) <= sum(normal_errors)
    assert max(precise_errors) <= max(normal_errors)


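# Illustrative sketch, not part of the original commit (the _demo_* name is
# hypothetical): the three float_precision modes may disagree in the final
# bits, but "round_trip" defers to Python's float() and matches it exactly,
# which the round-trip assertion above relies on.
def _demo_float_precision_modes():
    from io import StringIO

    import pandas as pd

    text = "a\n1.2345678901234567890123456"
    expected = float(text[2:])
    for mode in ("legacy", "high", "round_trip"):
        val = pd.read_csv(StringIO(text), float_precision=mode)["a"][0]
        assert abs(val - expected) < 1e-12  # all modes are close
    rt = pd.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
    assert rt == expected  # round_trip is bit-for-bit identical

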
def test_usecols_dtypes(c_parser_only, using_infer_string):
    parser = c_parser_only
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

    result = parser.read_csv(
        StringIO(data),
        usecols=(0, 1, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )
    result2 = parser.read_csv(
        StringIO(data),
        usecols=(0, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )

    if using_infer_string:
        assert (result.dtypes == ["string", int, float]).all()
        assert (result2.dtypes == ["string", float]).all()
    else:
        assert (result.dtypes == [object, int, float]).all()
        assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
    # see gh-2090

    parser = c_parser_only
    data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

    result = parser.read_csv(StringIO(data), dtype=object)
    assert (result.dtypes == object).all()

    result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
    assert result["B"][2] == ""


def test_custom_lineterminator(c_parser_only):
    parser = c_parser_only
    data = "a,b,c~1,2,3~4,5,6"

    result = parser.read_csv(StringIO(data), lineterminator="~")
    expected = parser.read_csv(StringIO(data.replace("~", "\n")))

    tm.assert_frame_equal(result, expected)


def test_parse_ragged_csv(c_parser_only):
    parser = c_parser_only
    data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    result = parser.read_csv(
        StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
    )

    expected = parser.read_csv(
        StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
    )

    tm.assert_frame_equal(result, expected)

    # too many columns, cause segfault if not careful
    data = "1,2\n3,4,5"

    result = parser.read_csv(StringIO(data), header=None, names=range(50))
    expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
        columns=range(50)
    )

    tm.assert_frame_equal(result, expected)


def test_tokenize_CR_with_quoting(c_parser_only):
    # see gh-3453
    parser = c_parser_only
    data = ' a,b,c\r"a,b","e,d","f,f"'

    result = parser.read_csv(StringIO(data), header=None)
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data))
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
def test_grow_boundary_at_cap(c_parser_only, count):
    # See gh-12494
    #
    # Cause of error was that the C parser
    # was not increasing the buffer size when
    # the desired space would fill the buffer
    # to capacity, which would later cause a
    # buffer overflow error when checking the
    # EOF terminator of the CSV stream.
    # 3 * 2^n commas was observed to break the parser
    parser = c_parser_only

    with StringIO("," * count) as s:
        expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
        df = parser.read_csv(s)
    tm.assert_frame_equal(df, expected)


@pytest.mark.slow
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_parse_trim_buffers(c_parser_only, encoding):
    # This test is part of a bugfix for gh-13703. It attempts to
    # stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of the parser otherwise modify the contents
    # of the memory space where it was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    # Also force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.

    parser = c_parser_only

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = (
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
    )

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the `n_lines` times.
    row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
    expected = DataFrame(
        [row for _ in range(n_lines)], dtype=object, columns=None, index=None
    )

    # Iterate over the CSV file in chunks of `chunksize` lines
    with parser.read_csv(
        StringIO(csv_data),
        header=None,
        dtype=object,
        chunksize=chunksize,
        encoding=encoding,
    ) as chunks_:
        result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)


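# Illustrative note, not part of the original commit (the _demo_* name is
# hypothetical): with chunksize=128 and n_lines = 2 * 128 + 15, the reader
# consumes two full chunks and then a 15-row residual, which is exactly the
# reallocation path the test above targets.
def _demo_trim_buffers_chunk_math():
    chunksize, n_lines = 128, 2 * 128 + 15
    full_chunks, residual = divmod(n_lines, chunksize)
    assert (full_chunks, residual) == (2, 15)

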
def test_internal_null_byte(c_parser_only):
    # see gh-14012
    #
    # The null byte ('\x00') should not be used as a
    # true line terminator, escape character, or comment
    # character, only as a placeholder to indicate that
    # none was specified.
    #
    # This test should be moved to test_common.py ONLY when
    # Python's csv class supports parsing '\x00'.
    parser = c_parser_only

    names = ["a", "b", "c"]
    data = "1,2,3\n4,\x00,6\n7,8,9"
    expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)

    result = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(result, expected)


def test_read_nrows_large(c_parser_only):
    # gh-7626 - Read only nrows of data in for large inputs (>262144b)
    parser = c_parser_only
    header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
    data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
    header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
    data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
    test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2

    df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)

    assert df.size == 1010 * 10


def test_float_precision_round_trip_with_text(c_parser_only):
    # see gh-15140
    parser = c_parser_only
    df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
    tm.assert_frame_equal(df, DataFrame({0: ["a"]}))


def test_large_difference_in_columns(c_parser_only):
    # see gh-14125
    parser = c_parser_only

    count = 10000
    large_row = ("X," * count)[:-1] + "\n"
    normal_row = "XXXXXX XXXXXX,111111111111111\n"
    test_input = (large_row + normal_row * 6)[:-1]

    result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
    rows = test_input.split("\n")

    expected = DataFrame([row.split(",")[0] for row in rows])
    tm.assert_frame_equal(result, expected)


def test_data_after_quote(c_parser_only):
    # see gh-15910
    parser = c_parser_only

    data = 'a\n1\n"b"a'
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"a": ["1", "ba"]})
    tm.assert_frame_equal(result, expected)


def test_comment_whitespace_delimited(c_parser_only):
    parser = c_parser_only
    test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
    with tm.assert_produces_warning(
        ParserWarning, match="Skipping line", check_stacklevel=False
    ):
        df = parser.read_csv(
            StringIO(test_input),
            comment="#",
            header=None,
            delimiter="\\s+",
            skiprows=0,
            on_bad_lines="warn",
        )
    expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
    tm.assert_frame_equal(df, expected)


def test_file_like_no_next(c_parser_only):
    # gh-16530: the file-like need not have a "next" or "__next__"
    # attribute despite having an "__iter__" attribute.
    #
    # NOTE: This is only true for the C engine, not Python engine.
    class NoNextBuffer(StringIO):
        def __next__(self):
            raise AttributeError("No next method")

        next = __next__

    parser = c_parser_only
    data = "a\n1"

    expected = DataFrame({"a": [1]})
    result = parser.read_csv(NoNextBuffer(data))

    tm.assert_frame_equal(result, expected)


def test_buffer_rd_bytes_bad_unicode(c_parser_only):
|
||||
# see gh-22748
|
||||
t = BytesIO(b"\xB0")
|
||||
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
|
||||
msg = "'utf-8' codec can't encode character"
|
||||
with pytest.raises(UnicodeError, match=msg):
|
||||
c_parser_only.read_csv(t, encoding="UTF-8")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
|
||||
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
|
||||
# see gh-16530
|
||||
#
|
||||
# Unfortunately, Python's CSV library can't handle
|
||||
# tarfile objects (expects string, not bytes when
|
||||
# iterating through a file-like).
|
||||
parser = c_parser_only
|
||||
tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
|
||||
|
||||
with tarfile.open(tar_path, "r") as tar:
|
||||
data_file = tar.extractfile("tar_data.csv")
|
||||
|
||||
out = parser.read_csv(data_file)
|
||||
expected = DataFrame({"a": [1]})
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
|
||||
def test_chunk_whitespace_on_boundary(c_parser_only):
|
||||
# see gh-9735: this issue is C parser-specific (bug when
|
||||
# parsing whitespace and characters at chunk boundary)
|
||||
#
|
||||
# This test case has a field too large for the Python parser / CSV library.
|
||||
parser = c_parser_only
|
||||
|
||||
chunk1 = "a" * (1024 * 256 - 2) + "\na"
|
||||
chunk2 = "\n a"
|
||||
result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)
|
||||
|
||||
expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_file_handles_mmap(c_parser_only, csv1):
|
||||
# gh-14418
|
||||
#
|
||||
# Don't close user provided file handles.
|
||||
parser = c_parser_only
|
||||
|
||||
with open(csv1, encoding="utf-8") as f:
|
||||
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
|
||||
parser.read_csv(m)
|
||||
assert not m.closed
|
||||
|
||||
|
||||
def test_file_binary_mode(c_parser_only):
|
||||
# see gh-23779
|
||||
parser = c_parser_only
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write("1,2,3\n4,5,6")
|
||||
|
||||
with open(path, "rb") as f:
|
||||
result = parser.read_csv(f, header=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_unix_style_breaks(c_parser_only):
|
||||
# GH 11020
|
||||
parser = c_parser_only
|
||||
with tm.ensure_clean() as path:
|
||||
with open(path, "w", newline="\n", encoding="utf-8") as f:
|
||||
f.write("blah\n\ncol_1,col_2,col_3\n\n")
|
||||
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
|
||||
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
|
||||
@pytest.mark.parametrize(
|
||||
"data,thousands,decimal",
|
||||
[
|
||||
(
|
||||
"""A|B|C
|
||||
1|2,334.01|5
|
||||
10|13|10.
|
||||
""",
|
||||
",",
|
||||
".",
|
||||
),
|
||||
(
|
||||
"""A|B|C
|
||||
1|2.334,01|5
|
||||
10|13|10,
|
||||
""",
|
||||
".",
|
||||
",",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_1000_sep_with_decimal(
|
||||
c_parser_only, data, thousands, decimal, float_precision
|
||||
):
|
||||
parser = c_parser_only
|
||||
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data),
|
||||
sep="|",
|
||||
thousands=thousands,
|
||||
decimal=decimal,
|
||||
float_precision=float_precision,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_float_precision_options(c_parser_only):
|
||||
# GH 17154, 36228
|
||||
parser = c_parser_only
|
||||
s = "foo\n243.164\n"
|
||||
df = parser.read_csv(StringIO(s))
|
||||
df2 = parser.read_csv(StringIO(s), float_precision="high")
|
||||
|
||||
tm.assert_frame_equal(df, df2)
|
||||
|
||||
df3 = parser.read_csv(StringIO(s), float_precision="legacy")
|
||||
|
||||
assert not df.iloc[0, 0] == df3.iloc[0, 0]
|
||||
|
||||
msg = "Unrecognized float_precision option: junk"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(s), float_precision="junk")
|
||||
@@ -0,0 +1,227 @@
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import DataFrame
import pandas._testing as tm


@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", na_values=na_values)
        return
    result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
)
def test_line_comment(all_parsers, read_kwargs, request):
    parser = all_parsers
    data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
    warn = None
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if read_kwargs.get("delim_whitespace"):
        data = data.replace(",", " ")
        warn = FutureWarning
    elif read_kwargs.get("lineterminator"):
        data = data.replace("\n", read_kwargs.get("lineterminator"))

    read_kwargs["comment"] = "#"
    if parser.engine == "pyarrow":
        if "lineterminator" in read_kwargs:
            msg = (
                "The 'lineterminator' option is not supported with the 'pyarrow' engine"
            )
        else:
            msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                warn, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), **read_kwargs)
        return
    elif parser.engine == "python" and read_kwargs.get("lineterminator"):
        msg = r"Custom line terminators not supported in python parser \(yet\)"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                warn, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(StringIO(data), **read_kwargs)
        return

    with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
        result = parser.read_csv(StringIO(data), **read_kwargs)

    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows(all_parsers):
    parser = all_parsers
    data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # This should ignore the first four lines (including comments).
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", skiprows=4)
        return

    result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
    tm.assert_frame_equal(result, expected)


def test_comment_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Header should begin at the second non-comment line.
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", header=1)
        return
    result = parser.read_csv(StringIO(data), comment="#", header=1)
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Skiprows should skip the first 4 lines (including comments),
    # while header should start from the second non-commented line,
    # starting with line 5.
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
        return

    result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
    parser = all_parsers
    data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"

    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data.replace("#", comment_char)), comment=comment_char
            )
        return
    result = parser.read_csv(
        StringIO(data.replace("#", comment_char)), comment=comment_char
    )

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
    # see gh-4623
    parser = all_parsers
    data = "# notes\na,b,c\n# more notes\n1,2,3"

    if header is None:
        expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
    else:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])

    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", header=header)
        return
    result = parser.read_csv(StringIO(data), comment="#", header=header)
    tm.assert_frame_equal(result, expected)


def test_comment_char_in_default_value(all_parsers, request):
    # GH#34002
    if all_parsers.engine == "c":
        reason = "see gh-34002: works on the python engine but not the c engine"
        # NA value containing comment char is interpreted as comment
        request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError))
    parser = all_parsers

    data = (
        "# this is a comment\n"
        "col1,col2,col3,col4\n"
        "1,2,3,4#inline comment\n"
        "4,5#,6,10\n"
        "7,8,#N/A,11\n"
    )
    if parser.engine == "pyarrow":
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
        return
    result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
    expected = DataFrame(
        {
            "col1": [1, 4, 7],
            "col2": [2, 5, 8],
            "col3": [3.0, np.nan, np.nan],
            "col4": [4.0, np.nan, 11.0],
        }
    )
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,211 @@
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""

import os
from pathlib import Path
import tarfile
import zipfile

import pytest

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture(params=[True, False])
def buffer(request):
    return request.param


@pytest.fixture
def parser_and_data(all_parsers, csv1):
    parser = all_parsers

    with open(csv1, "rb") as f:
        data = f.read()
    expected = parser.read_csv(csv1)

    return parser, data, expected


@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
    parser, data, expected = parser_and_data

    with tm.ensure_clean("test_file.zip") as path:
        with zipfile.ZipFile(path, mode="w") as tmp:
            tmp.writestr("test_file", data)

        if compression == "zip2":
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression="zip")
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
    parser, data, expected = parser_and_data

    with tm.ensure_clean("combined_zip.zip") as path:
        inner_file_names = ["test_file", "second_file"]

        with zipfile.ZipFile(path, mode="w") as tmp:
            for file_name in inner_file_names:
                tmp.writestr(file_name, data)

        with pytest.raises(ValueError, match="Multiple files"):
            parser.read_csv(path, compression=compression)


def test_zip_error_no_files(parser_and_data):
    parser, _, _ = parser_and_data

    with tm.ensure_clean() as path:
        with zipfile.ZipFile(path, mode="w"):
            pass

        with pytest.raises(ValueError, match="Zero files"):
            parser.read_csv(path, compression="zip")


def test_zip_error_invalid_zip(parser_and_data):
    parser, _, _ = parser_and_data

    with tm.ensure_clean() as path:
        with open(path, "rb") as f:
            with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"):
                parser.read_csv(f, compression="zip")


@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(
    request,
    parser_and_data,
    compression_only,
    buffer,
    filename,
    compression_to_extension,
):
    parser, data, expected = parser_and_data
    compress_type = compression_only

    ext = compression_to_extension[compress_type]
    filename = filename if filename is None else filename.format(ext=ext)

    if filename and buffer:
        request.applymarker(
            pytest.mark.xfail(
                reason="Cannot deduce compression from buffer of compressed data."
            )
        )

    with tm.ensure_clean(filename=filename) as path:
        tm.write_to_compressed(compress_type, path, data)
        compression = "infer" if filename else compress_type

        if buffer:
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression=compression)
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
    # see gh-9770
    parser = all_parsers
    kwargs = {"index_col": 0, "parse_dates": True}

    expected = parser.read_csv(csv1, **kwargs)
    kwargs["compression"] = "infer"

    if buffer:
        with open(csv1, encoding="utf-8") as f:
            result = parser.read_csv(f, **kwargs)
    else:
        ext = "." + ext if ext else ""
        result = parser.read_csv(csv1 + ext, **kwargs)

    tm.assert_frame_equal(result, expected)


def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
    # see gh-18071, gh-24130
    parser = all_parsers
    encoding = encoding_fmt.format(utf_value)
    path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")

    result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
    expected = DataFrame(
        {
            "Country": ["Venezuela", "Venezuela"],
            "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
        }
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
    parser = all_parsers
    compress_kwargs = {"compression": invalid_compression}

    msg = f"Unrecognized compression type: {invalid_compression}"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv("test_file.zip", **compress_kwargs)


def test_compression_tar_archive(all_parsers, csv_dir_path):
    parser = all_parsers
    path = os.path.join(csv_dir_path, "tar_csv.tar.gz")
    df = parser.read_csv(path)
    assert list(df.columns) == ["a"]


def test_ignore_compression_extension(all_parsers):
    parser = all_parsers
    df = DataFrame({"a": [0, 1]})
    with tm.ensure_clean("test.csv") as path_csv:
        with tm.ensure_clean("test.csv.zip") as path_zip:
            # make sure to create un-compressed file with zip extension
            df.to_csv(path_csv, index=False)
            Path(path_zip).write_text(
                Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8"
            )

            tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df)


def test_writes_tar_gz(all_parsers):
    parser = all_parsers
    data = DataFrame(
        {
            "Country": ["Venezuela", "Venezuela"],
            "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
        }
    )
    with tm.ensure_clean("test.tar.gz") as tar_path:
        data.to_csv(tar_path, index=False)

        # test that read_csv infers .tar.gz to gzip:
        tm.assert_frame_equal(parser.read_csv(tar_path), data)

        # test that file is indeed gzipped:
        with tarfile.open(tar_path, "r:gz") as tar:
            result = parser.read_csv(
                tar.extractfile(tar.getnames()[0]), compression="infer"
            )
            tm.assert_frame_equal(result, data)
@@ -0,0 +1,36 @@
import numpy as np
import pytest

from pandas.errors import DtypeWarning

import pandas._testing as tm
from pandas.core.arrays import ArrowExtensionArray

from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks


def test_concatenate_chunks_pyarrow():
    # GH#51876
    pa = pytest.importorskip("pyarrow")
    chunks = [
        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
        {0: ArrowExtensionArray(pa.array([1, 2]))},
    ]
    result = _concatenate_chunks(chunks)
    expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
    tm.assert_extension_array_equal(result[0], expected)


def test_concatenate_chunks_pyarrow_strings():
    # GH#51876
    pa = pytest.importorskip("pyarrow")
    chunks = [
        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
        {0: ArrowExtensionArray(pa.array(["a", "b"]))},
    ]
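    # Mixed float and string chunks cannot be concatenated losslessly, so the
    # result falls back to an object-dtype numpy array and warns about it.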
    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
        result = _concatenate_chunks(chunks)
    expected = np.concatenate(
        [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
    )
    tm.assert_numpy_array_equal(result[0], expected)
@@ -0,0 +1,263 @@
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

from dateutil.parser import parse
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm


def test_converters_type_must_be_dict(all_parsers):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
"""
    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), converters=0)
        return
    with pytest.raises(TypeError, match="Type converters.+"):
        parser.read_csv(StringIO(data), converters=0)


@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
    "converter", [parse, lambda x: int(x.split("/")[2])]  # Produce integer.
)
def test_converters(all_parsers, column, converter):
    parser = all_parsers
    data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), converters={column: converter})
        return

    result = parser.read_csv(StringIO(data), converters={column: converter})

    expected = parser.read_csv(StringIO(data))
    expected["D"] = expected["D"].map(converter)

    tm.assert_frame_equal(result, expected)


def test_converters_no_implicit_conv(all_parsers):
    # see gh-2184
    parser = all_parsers
    data = """000102,1.2,A\n001245,2,B"""

    converters = {0: lambda x: x.strip()}

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), header=None, converters=converters)
        return

    result = parser.read_csv(StringIO(data), header=None, converters=converters)

    # Column 0 should not be casted to numeric and should remain as object.
    expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
    tm.assert_frame_equal(result, expected)


def test_converters_euro_decimal_format(all_parsers):
    # see gh-583
    converters = {}
    parser = all_parsers

    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""
    converters["Number1"] = converters["Number2"] = converters[
        "Number3"
    ] = lambda x: float(x.replace(",", "."))

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=";", converters=converters)
        return

    result = parser.read_csv(StringIO(data), sep=";", converters=converters)
    expected = DataFrame(
        [
            [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
            [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
            [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
        ],
        columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
    )
    tm.assert_frame_equal(result, expected)


def test_converters_corner_with_nans(all_parsers):
    parser = all_parsers
    data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    # Example converters.
    def convert_days(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    def convert_days_sentinel(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    def convert_score(x):
        x = x.strip()

        if not x:
            return np.nan

        if x.find("-") > 0:
            val_min, val_max = map(int, x.split("-"))
            val = 0.5 * (val_min + val_max)
        else:
            val = float(x)

        return val

    results = []

    for day_converter in [convert_days, convert_days_sentinel]:
        if parser.engine == "pyarrow":
            msg = "The 'converters' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(
                    StringIO(data),
                    converters={"score": convert_score, "days": day_converter},
                    na_values=["", None],
                )
            continue

        result = parser.read_csv(
            StringIO(data),
            converters={"score": convert_score, "days": day_converter},
            na_values=["", None],
        )
        assert pd.isna(result["days"][1])
        results.append(result)

    if parser.engine != "pyarrow":
        tm.assert_frame_equal(results[0], results[1])


@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
    # see gh-1835 , GH#40589
    parser = all_parsers
    data = "A;B\n1;2\n3;4"

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
            )
        return

    rs = parser.read_csv(
        StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
    )

    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
    tm.assert_frame_equal(rs, xp)


def test_converter_identity_object(all_parsers):
    # GH#40589
    parser = all_parsers
    data = "A,B\n1,2\n3,4"

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), converters={"A": lambda x: x})
        return

    rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})

    xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
    tm.assert_frame_equal(rs, xp)


def test_converter_multi_index(all_parsers):
    # GH 42446
    parser = all_parsers
    data = "A,B,B\nX,Y,Z\n1,2,3"

    if parser.engine == "pyarrow":
        msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                header=list(range(2)),
                converters={
                    ("A", "X"): np.int32,
                    ("B", "Y"): np.int32,
                    ("B", "Z"): np.float32,
                },
            )
        return

    result = parser.read_csv(
        StringIO(data),
        header=list(range(2)),
        converters={
            ("A", "X"): np.int32,
            ("B", "Y"): np.int32,
            ("B", "Z"): np.float32,
        },
    )

    expected = DataFrame(
        {
            ("A", "X"): np.int32([1]),
            ("B", "Y"): np.int32([2]),
            ("B", "Z"): np.float32([3]),
        }
    )

    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,195 @@
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import csv
from io import StringIO

import pytest

from pandas.errors import ParserWarning

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture
def custom_dialect():
    dialect_name = "weird"
    dialect_kwargs = {
        "doublequote": False,
        "escapechar": "~",
        "delimiter": ":",
        "skipinitialspace": False,
        "quotechar": "`",
        "quoting": 3,
    }
    return dialect_name, dialect_kwargs


def test_dialect(all_parsers):
    parser = all_parsers
    data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

    dia = csv.excel()
    dia.quoting = csv.QUOTE_NONE

    if parser.engine == "pyarrow":
        msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), dialect=dia)
        return

    df = parser.read_csv(StringIO(data), dialect=dia)

    data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
    exp = parser.read_csv(StringIO(data))
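    # With QUOTE_NONE, the stray quote in 'index1,"a' is read as literal text,
    # so mirror that in the expected frame by prepending '"' to the "a" cell.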
exp.replace("a", '"a', inplace=True)
|
||||
tm.assert_frame_equal(df, exp)
|
||||
|
||||
|
||||
def test_dialect_str(all_parsers):
|
||||
dialect_name = "mydialect"
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
fruit:vegetable
|
||||
apple:broccoli
|
||||
pear:tomato
|
||||
"""
|
||||
exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})
|
||||
|
||||
with tm.with_csv_dialect(dialect_name, delimiter=":"):
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dialect=dialect_name)
|
||||
return
|
||||
|
||||
df = parser.read_csv(StringIO(data), dialect=dialect_name)
|
||||
tm.assert_frame_equal(df, exp)
|
||||
|
||||
|
||||
def test_invalid_dialect(all_parsers):
|
||||
class InvalidDialect:
|
||||
pass
|
||||
|
||||
data = "a\n1"
|
||||
parser = all_parsers
|
||||
msg = "Invalid dialect"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), dialect=InvalidDialect)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"arg",
|
||||
[None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
|
||||
)
|
||||
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
|
||||
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
|
||||
# see gh-23761.
|
||||
dialect_name, dialect_kwargs = custom_dialect
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
data = "a:b\n1:2"
|
||||
|
||||
warning_klass = None
|
||||
kwds = {}
|
||||
|
||||
# arg=None tests when we pass in the dialect without any other arguments.
|
||||
if arg is not None:
|
||||
if value == "dialect": # No conflict --> no warning.
|
||||
kwds[arg] = dialect_kwargs[arg]
|
||||
elif value == "default": # Default --> no warning.
|
||||
from pandas.io.parsers.base_parser import parser_defaults
|
||||
|
||||
kwds[arg] = parser_defaults[arg]
|
||||
else: # Non-default + conflict with dialect --> warning.
|
||||
warning_klass = ParserWarning
|
||||
kwds[arg] = "blah"
|
||||
|
||||
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv_check_warnings(
|
||||
# No warning bc we raise
|
||||
None,
|
||||
"Conflicting values for",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwds,
|
||||
)
|
||||
return
|
||||
result = parser.read_csv_check_warnings(
|
||||
warning_klass,
|
||||
"Conflicting values for",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwds,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kwargs,warning_klass",
|
||||
[
|
||||
({"sep": ","}, None), # sep is default --> sep_override=True
|
||||
({"sep": "."}, ParserWarning), # sep isn't default --> sep_override=False
|
||||
({"delimiter": ":"}, None), # No conflict
|
||||
({"delimiter": None}, None), # Default arguments --> sep_override=True
|
||||
({"delimiter": ","}, ParserWarning), # Conflict
|
||||
({"delimiter": "."}, ParserWarning), # Conflict
|
||||
],
|
||||
ids=[
|
||||
"sep-override-true",
|
||||
"sep-override-false",
|
||||
"delimiter-no-conflict",
|
||||
"delimiter-default-arg",
|
||||
"delimiter-conflict",
|
||||
"delimiter-conflict2",
|
||||
],
|
||||
)
|
||||
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
|
||||
# see gh-23761.
|
||||
dialect_name, dialect_kwargs = custom_dialect
|
||||
parser = all_parsers
|
||||
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
data = "a:b\n1:2"
|
||||
|
||||
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The 'dialect' option is not supported with the 'pyarrow' engine"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv_check_warnings(
|
||||
# no warning bc we raise
|
||||
None,
|
||||
"Conflicting values for 'delimiter'",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwargs,
|
||||
)
|
||||
return
|
||||
result = parser.read_csv_check_warnings(
|
||||
warning_klass,
|
||||
"Conflicting values for 'delimiter'",
|
||||
StringIO(data),
|
||||
dialect=dialect_name,
|
||||
**kwargs,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,337 @@
"""
Tests encoding functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import (
    BytesIO,
    TextIOWrapper,
)
import os
import tempfile
import uuid

import numpy as np
import pytest

from pandas import (
    DataFrame,
    read_csv,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_bytes_io_input(all_parsers):
    encoding = "cp1255"
    parser = all_parsers

    data = BytesIO("שלום:1234\n562:123".encode(encoding))
    result = parser.read_csv(data, sep=":", encoding=encoding)

    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_read_csv_unicode(all_parsers):
    parser = all_parsers
    data = BytesIO("\u0141aski, Jan;1".encode())

    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
    expected = DataFrame([["\u0141aski, Jan", 1]])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
    # see gh-2298
    parser = all_parsers
    data = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(
        ",", sep
    )
    path = f"__{uuid.uuid4()}__.csv"
    kwargs = {"sep": sep, "skiprows": 2}
    utf8 = "utf-8"

    with tm.ensure_clean(path) as path:
        bytes_data = data.encode(encoding)

        with open(path, "wb") as f:
            f.write(bytes_data)

        with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
            result = parser.read_csv(path, encoding=encoding, **kwargs)
            expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
        tm.assert_frame_equal(result, expected)


def test_utf16_example(all_parsers, csv_dir_path):
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers
    result = parser.read_csv(path, encoding="utf-16", sep="\t")
    assert len(result) == 50


def test_unicode_encoding(all_parsers, csv_dir_path):
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers

    result = parser.read_csv(path, header=None, encoding="latin-1")
    result = result.set_index(0)
    got = result[1][1632]

    expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
    assert got == expected


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # Basic test
        ("a\n1", {}, DataFrame({"a": [1]})),
        # "Regular" quoting
        ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),
        # Test in a data row instead of header
        ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),
        # Test in empty data row with skipping
        ("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
        # Test in empty data row without skipping
        (
            "\n1",
            {"names": ["a"], "skip_blank_lines": False},
            DataFrame({"a": [np.nan, 1]}),
        ),
    ],
)
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
    # see gh-4793
    parser = all_parsers
    bom = "\ufeff"
    utf8 = "utf-8"

    def _encode_data_with_bom(_data):
        bom_data = (bom + _data).encode(utf8)
        return BytesIO(bom_data)

    if (
        parser.engine == "pyarrow"
        and data == "\n1"
        and kwargs.get("skip_blank_lines", True)
    ):
        # CSV parse error: Empty CSV file or block: cannot infer number of columns
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
    # see gh-13549
    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
    parser = all_parsers

    encoding = encoding_fmt.format(utf_value)
    data = "mb_num,multibyte\n4.8,test".encode(encoding)

    result = parser.read_csv(BytesIO(data), encoding=encoding)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "file_path,encoding",
    [
        (("io", "data", "csv", "test1.csv"), "utf-8"),
        (("io", "parser", "data", "unicode_series.csv"), "latin-1"),
        (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"),
    ],
)
def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
    # gh-23779: Python csv engine shouldn't error on files opened in binary.
    # gh-31575: Python csv engine shouldn't error on files opened in raw binary.
    parser = all_parsers

    fpath = datapath(*file_path)
    expected = parser.read_csv(fpath, encoding=encoding)

    with open(fpath, encoding=encoding) as fa:
        result = parser.read_csv(fa)
        assert not fa.closed
    tm.assert_frame_equal(expected, result)

    with open(fpath, mode="rb") as fb:
        result = parser.read_csv(fb, encoding=encoding)
        assert not fb.closed
    tm.assert_frame_equal(expected, result)

    with open(fpath, mode="rb", buffering=0) as fb:
        result = parser.read_csv(fb, encoding=encoding)
        assert not fb.closed
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
    # see gh-24130
    parser = all_parsers
    encoding = encoding_fmt.format(utf_value)

    if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]:
        # FIXME: this is bad!
        pytest.skip("These cases freeze")

    expected = DataFrame({"foo": ["bar"]})

    with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
        f.write("foo\nbar")
        f.seek(0)

        result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
        tm.assert_frame_equal(result, expected)


def test_encoding_named_temp_file(all_parsers):
    # see gh-31819
    parser = all_parsers
    encoding = "shift-jis"

    title = "てすと"
    data = "こむ"

    expected = DataFrame({title: [data]})

    with tempfile.NamedTemporaryFile() as f:
        f.write(f"{title}\n{data}".encode(encoding))

        f.seek(0)

        result = parser.read_csv(f, encoding=encoding)
        tm.assert_frame_equal(result, expected)
        assert not f.closed


@pytest.mark.parametrize(
    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
)
def test_parse_encoded_special_characters(encoding):
    # GH16218 Verify parsing of data with encoded special characters
    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
    data = "a\tb\n：foo\t0\nbar\t1\nbaz\t2"  # noqa: RUF001
    encoded_data = BytesIO(data.encode(encoding))
    result = read_csv(encoded_data, delimiter="\t", encoding=encoding)

    expected = DataFrame(
        data=[["：foo", 0], ["bar", 1], ["baz", 2]],  # noqa: RUF001
        columns=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
def test_encoding_memory_map(all_parsers, encoding):
    # GH40986
    parser = all_parsers
    expected = DataFrame(
        {
            "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
            "mask": ["red", "purple", "orange", "blue"],
            "weapon": ["sai", "bo staff", "nunchunk", "katana"],
        }
    )
    with tm.ensure_clean() as file:
        expected.to_csv(file, index=False, encoding=encoding)

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(file, encoding=encoding, memory_map=True)
            return

        df = parser.read_csv(file, encoding=encoding, memory_map=True)
    tm.assert_frame_equal(df, expected)


def test_chunk_splits_multibyte_char(all_parsers):
    """
    Chunk splits a multibyte character with memory_map=True

    GH 43540
    """
    parser = all_parsers
    # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
    df = DataFrame(data=["a" * 127] * 2048)

    # Put two-bytes utf-8 encoded character "ą" at the end of chunk
    # utf-8 encoding of "ą" is b'\xc4\x85'
    df.iloc[2047] = "a" * 127 + "ą"
    with tm.ensure_clean("bug-gh43540.csv") as fname:
        df.to_csv(fname, index=False, header=False, encoding="utf-8")

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(fname, header=None, memory_map=True)
            return

        dfr = parser.read_csv(fname, header=None, memory_map=True)
    tm.assert_frame_equal(dfr, df)


def test_readcsv_memmap_utf8(all_parsers):
    """
    GH 43787

    Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
    """
    lines = []
    line_length = 128
    start_char = " "
    end_char = "\U00010080"
    # This for loop creates a list of 128-char strings
    # consisting of consecutive Unicode chars
    for lnum in range(ord(start_char), ord(end_char), line_length):
        line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
        try:
            line.encode("utf-8")
        except UnicodeEncodeError:
            continue
        lines.append(line)
    parser = all_parsers
    df = DataFrame(lines)
    with tm.ensure_clean("utf8test.csv") as fname:
        df.to_csv(fname, index=False, header=False, encoding="utf-8")

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
            return

        dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
    tm.assert_frame_equal(df, dfr)


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("mode", ["w+b", "w+t"])
def test_not_readable(all_parsers, mode):
    # GH43439
    parser = all_parsers
    content = b"abcd"
    if "t" in mode:
        content = "abcd"
    with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
        handle.write(content)
        handle.seek(0)
        df = parser.read_csv(handle)
    expected = DataFrame([], columns=["abcd"])
    tm.assert_frame_equal(df, expected)
@@ -0,0 +1,733 @@
"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""

from collections import namedtuple
from io import StringIO

import numpy as np
import pytest

from pandas.errors import ParserError

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@xfail_pyarrow  # TypeError: an integer is required
def test_read_with_bad_header(all_parsers):
    parser = all_parsers
    msg = r"but only \d+ lines in file"

    with pytest.raises(ValueError, match=msg):
        s = StringIO(",,")
        parser.read_csv(s, header=[10])


def test_negative_header(all_parsers):
    # see gh-27779
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    with pytest.raises(
        ValueError,
        match="Passing negative integer to header is invalid. "
        "For no header, use header=None instead",
    ):
        parser.read_csv(StringIO(data), header=-1)


@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
def test_negative_multi_index_header(all_parsers, header):
    # see gh-27779
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    with pytest.raises(
        ValueError, match="cannot specify multi-index header with negative integers"
    ):
        parser.read_csv(StringIO(data), header=header)


@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
    # see gh-6114
    parser = all_parsers
    data = """\
MyColumn
a
b
a
b"""
    msg = "Passing a bool to header is invalid"
    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), header=header)


@xfail_pyarrow  # AssertionError: DataFrame are different
def test_header_with_index_col(all_parsers):
    parser = all_parsers
    data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    names = ["A", "B", "C"]
    result = parser.read_csv(StringIO(data), names=names)

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result, expected)


def test_header_not_first_line(all_parsers):
    parser = all_parsers
    data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
    data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""

    result = parser.read_csv(StringIO(data), header=2, index_col=0)
    expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index(all_parsers):
    parser = all_parsers

    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
    data_gen_f = lambda r, c: f"R{r}C{c}"

    data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)]
    index = MultiIndex.from_arrays(
        [[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]],
        names=["R0", "R1"],
    )
    columns = MultiIndex.from_arrays(
        [
            [f"C_l0_g{i}" for i in range(3)],
            [f"C_l1_g{i}" for i in range(3)],
            [f"C_l2_g{i}" for i in range(3)],
            [f"C_l3_g{i}" for i in range(3)],
        ],
        names=["C0", "C1", "C2", "C3"],
    )
    expected = DataFrame(data, columns=columns, index=index)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        (
            {"index_col": ["foo", "bar"]},
            (
                "index_col must only contain "
                "row numbers when specifying "
                "a multi-index header"
            ),
        ),
        (
            {"index_col": [0, 1], "names": ["foo", "bar"]},
            ("cannot specify names when specifying a multi-index header"),
        ),
        (
            {"index_col": [0, 1], "usecols": ["foo", "bar"]},
            ("cannot specify usecols when specifying a multi-index header"),
        ),
    ],
)
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)


_TestTuple = namedtuple("_TestTuple", ["first", "second"])


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 3,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 3,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format1(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 2,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 2,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format2(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 2,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 2,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format3(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    expected = expected.reset_index(drop=True)
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index_common_format_malformed1(all_parsers):
    parser = all_parsers
    expected = DataFrame(
        np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
        index=Index([1, 7]),
        columns=MultiIndex(
            levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
            codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
            names=["a", "q"],
        ),
    )
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index_common_format_malformed2(all_parsers):
    parser = all_parsers
    expected = DataFrame(
        np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
        index=Index([1, 7]),
        columns=MultiIndex(
            levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
            codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
            names=[None, "q"],
        ),
    )

    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index_common_format_malformed3(all_parsers):
    parser = all_parsers
    expected = DataFrame(
        np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
        index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
        columns=MultiIndex(
            levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
            codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
            names=[None, "q"],
        ),
    )
    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
    tm.assert_frame_equal(expected, result)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_multi_index_blank_line(all_parsers):
    # GH 40442
    parser = all_parsers
    data = [[None, None], [1, 2], [3, 4]]
    columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
    expected = DataFrame(data, columns=columns)
    data = "a,b\nA,B\n,\n1,2\n3,4"
    result = parser.read_csv(StringIO(data), header=[0, 1])
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
)
def test_header_names_backward_compat(all_parsers, data, header, request):
    # see gh-2539
    parser = all_parsers

    if parser.engine == "pyarrow" and header is not None:
        mark = pytest.mark.xfail(reason="DataFrame.columns are different")
        request.applymarker(mark)

    expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block: cannot infer
@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
def test_read_only_header_no_rows(all_parsers, kwargs):
    # See gh-7773
    parser = all_parsers
    expected = DataFrame(columns=["a", "b", "c"])

    result = parser.read_csv(StringIO("a,b,c"), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,names",
    [
        ({}, [0, 1, 2, 3, 4]),
        (
            {"names": ["foo", "bar", "baz", "quux", "panda"]},
            ["foo", "bar", "baz", "quux", "panda"],
        ),
    ],
)
def test_no_header(all_parsers, kwargs, names):
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    expected = DataFrame(
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
    )
    result = parser.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
def test_non_int_header(all_parsers, header):
    # see gh-16338
    msg = "header must be integer or list of integers"
    data = """1,2\n3,4"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=header)


@xfail_pyarrow  # TypeError: an integer is required
def test_singleton_header(all_parsers):
    # see gh-7757
    data = """a,b,c\n0,1,2\n1,2,3"""
    parser = all_parsers

    expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
|
||||
result = parser.read_csv(StringIO(data), header=[0])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is required
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected",
|
||||
[
|
||||
(
|
||||
"A,A,A,B\none,one,one,two\n0,40,34,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
|
||||
),
|
||||
),
|
||||
),
|
||||
(
|
||||
"A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
|
||||
),
|
||||
),
|
||||
),
|
||||
(
|
||||
"A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
|
||||
DataFrame(
|
||||
[[0, 40, 34, 0.1, 0.1]],
|
||||
columns=MultiIndex.from_tuples(
|
||||
[
|
||||
("A", "one"),
|
||||
("A", "one.1"),
|
||||
("A", "one.1.1"),
|
||||
("B", "two"),
|
||||
("B", "two.1"),
|
||||
]
|
||||
),
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_mangles_multi_index(all_parsers, data, expected):
|
||||
# see gh-18062
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=[0, 1])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # TypeError: an integer is requireds
|
||||
@pytest.mark.parametrize("index_col", [None, [0]])
|
||||
@pytest.mark.parametrize(
|
||||
"columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
|
||||
)
|
||||
def test_multi_index_unnamed(all_parsers, index_col, columns):
|
||||
# see gh-23687
|
||||
#
|
||||
# When specifying a multi-index header, make sure that
|
||||
# we don't error just because one of the rows in our header
|
||||
# has ALL column names containing the string "Unnamed". The
|
||||
# correct condition to check is whether the row contains
|
||||
# ALL columns that did not have names (and instead were given
|
||||
# placeholder ones).
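    # Illustrative sketch (the header values here are assumed for
    # illustration, not part of the original test): for a two-row header
    # like
    #
    #   ,Unnamed
    #   0,1
    #
    # only the truly empty cells receive placeholder names such as
    # "Unnamed: 0_level_0"; a literal "Unnamed" label is kept verbatim.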
    parser = all_parsers
    header = [0, 1]

    if index_col is None:
        data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
    else:
        data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"

    result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
    exp_columns = []

    if columns is None:
        columns = ["", "", ""]

    for i, col in enumerate(columns):
        if not col:  # Unnamed.
            col = f"Unnamed: {i if index_col is None else i + 1}_level_0"

        exp_columns.append(col)

    columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
    expected = DataFrame([[2, 3], [4, 5]], columns=columns)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Expected 2 columns, got 3
def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
    # GH#38453
    parser = all_parsers
    data = """a, b
1,2,3
5,6,4
"""
    result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"])
    expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_read_csv_multiindex_columns(all_parsers):
    # GH#6051
    parser = all_parsers

    s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81"
    s2 = (
        "Male, Male, Male, Female, Female\n"
        "R, R, L, R, R\n"
        ".86, .67, .88, .78, .81\n"
        ".86, .67, .88, .78, .82"
    )

    mi = MultiIndex.from_tuples(
        [
            ("Male", "R"),
            (" Male", " R"),
            (" Male", " L"),
            (" Female", " R"),
            (" Female", " R.1"),
        ]
    )
    expected = DataFrame(
        [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi
    )

    df1 = parser.read_csv(StringIO(s1), header=[0, 1])
    tm.assert_frame_equal(df1, expected.iloc[:1])
    df2 = parser.read_csv(StringIO(s2), header=[0, 1])
    tm.assert_frame_equal(df2, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_read_csv_multi_header_length_check(all_parsers):
    # GH#43102
    parser = all_parsers

    case = """row11,row12,row13
row21,row22, row23
row31,row32
"""

    with pytest.raises(
        ParserError, match="Header rows must have an equal number of columns."
    ):
        parser.read_csv(StringIO(case), header=[0, 2])


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 2
def test_header_none_and_implicit_index(all_parsers):
    # GH#22144
    parser = all_parsers
    data = "x,1,5\ny,2\nz,3\n"
    result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
    expected = DataFrame(
        {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # regex mismatch "CSV parse error: Expected 2 columns, got "
def test_header_none_and_implicit_index_in_second_row(all_parsers):
    # GH#22144
    parser = all_parsers
    data = "x,1\ny,2,5\nz,3\n"
    with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
        parser.read_csv(StringIO(data), names=["a", "b"], header=None)


def test_header_none_and_on_bad_lines_skip(all_parsers):
    # GH#22144
    parser = all_parsers
    data = "x,1\ny,2,5\nz,3\n"
    result = parser.read_csv(
        StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
    )
    expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_missing_rows(all_parsers):
    # GH#47400
    parser = all_parsers
    data = """a,b
1,2
"""
    msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=[0, 1, 2])


# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_multiple_whitespaces(all_parsers):
    # GH#54931
    parser = all_parsers
    data = """aa bb(1,1) cc(1,1)
0 2 3.5"""

    result = parser.read_csv(StringIO(data), sep=r"\s+")
    expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
    tm.assert_frame_equal(result, expected)


# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_delim_whitespace(all_parsers):
    # GH#54918
    parser = all_parsers
    data = """a,b
1,2
3,4
"""

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data), delim_whitespace=True)
    expected = DataFrame({"a,b": ["1,2", "3,4"]})
    tm.assert_frame_equal(result, expected)


def test_usecols_no_header_pyarrow(pyarrow_parser_only):
    parser = pyarrow_parser_only
    data = """
a,i,x
b,j,y
"""
    result = parser.read_csv(
        StringIO(data),
        header=None,
        usecols=[0, 1],
        dtype="string[pyarrow]",
        dtype_backend="pyarrow",
        engine="pyarrow",
    )
    expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,376 @@
"""
Tests that the specified index column (a.k.a. "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
    parser = all_parsers
    no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
    header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"

    if with_header:
        data = header + no_header

        result = parser.read_csv(StringIO(data), index_col="ID")
        expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
        tm.assert_frame_equal(result, expected)
    else:
        data = no_header
        msg = "Index ID invalid"

        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), index_col="ID")


def test_index_col_named2(all_parsers):
    parser = all_parsers
    data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""

    expected = DataFrame(
        {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
        index=Index(["hello", "world", "foo"], name="message"),
    )
    names = ["a", "b", "c", "d", "message"]

    result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
    tm.assert_frame_equal(result, expected)


def test_index_col_is_true(all_parsers):
    # see gh-9798
    data = "a,b\n1,2"
    parser = all_parsers

    msg = "The value of index_col couldn't be 'True'"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), index_col=True)


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 4
def test_infer_index_col(all_parsers):
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "index_col,kwargs",
    [
        (None, {"columns": ["x", "y", "z"]}),
        (False, {"columns": ["x", "y", "z"]}),
        (0, {"columns": ["y", "z"], "index": Index([], name="x")}),
        (1, {"columns": ["x", "z"], "index": Index([], name="y")}),
        ("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
        ("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
        (
            [0, 1],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
            },
        ),
        (
            ["x", "y"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
            },
        ),
        (
            [1, 0],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
            },
        ),
        (
            ["y", "x"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
            },
        ),
    ],
)
def test_index_col_empty_data(all_parsers, index_col, kwargs):
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=index_col)

    expected = DataFrame(**kwargs)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_empty_with_index_col_false(all_parsers):
    # see gh-10413
    data = "x,y"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=False)

    expected = DataFrame(columns=["x", "y"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "index_names",
    [
        ["", ""],
        ["foo", ""],
        ["", "bar"],
        ["foo", "bar"],
        ["NotReallyUnnamed", "Unnamed: 0"],
    ],
)
def test_multi_index_naming(all_parsers, index_names, request):
    parser = all_parsers

    if parser.engine == "pyarrow" and "" in index_names:
        mark = pytest.mark.xfail(reason="One case raises, others are wrong")
        request.applymarker(mark)

    # We don't want empty index names being replaced with "Unnamed: 0"
    data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
    result = parser.read_csv(StringIO(data), index_col=[0, 1])

    expected = DataFrame(
        {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
    )
    expected.index.names = [name if name else None for name in index_names]
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_multi_index_naming_not_all_at_beginning(all_parsers):
    parser = all_parsers
    data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    result = parser.read_csv(StringIO(data), index_col=[0, 2])

    expected = DataFrame(
        {"Unnamed: 2": ["c", "d", "c", "d"]},
        index=MultiIndex(
            levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_no_multi_index_level_names_empty(all_parsers):
    # GH 10984
    parser = all_parsers
    midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
    expected = DataFrame(
        np.random.default_rng(2).standard_normal((3, 3)),
        index=midx,
        columns=["x", "y", "z"],
    )
    with tm.ensure_clean() as path:
        expected.to_csv(path)
        result = parser.read_csv(path, index_col=[0, 1, 2])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_header_with_index_col(all_parsers):
    # GH 33476
    parser = all_parsers
    data = """
I11,A,A
I12,B,B
I2,1,3
"""
    midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
    idx = Index(["I2"])
    expected = DataFrame([[1, 3]], index=idx, columns=midx)

    result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
    tm.assert_frame_equal(result, expected)

    col_idx = Index(["A", "A.1"])
    idx = Index(["I12", "I2"], name="I11")
    expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)

    result = parser.read_csv(StringIO(data), index_col="I11", header=0)
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
def test_index_col_large_csv(all_parsers, monkeypatch):
    # https://github.com/pandas-dev/pandas/issues/37094
    parser = all_parsers

    ARR_LEN = 100
    df = DataFrame(
        {
            "a": range(ARR_LEN + 1),
            "b": np.random.default_rng(2).standard_normal(ARR_LEN + 1),
        }
    )

    with tm.ensure_clean() as path:
        df.to_csv(path, index=False)
        with monkeypatch.context() as m:
            m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN)
            result = parser.read_csv(path, index_col=[0])

    tm.assert_frame_equal(result, df.set_index("a"))


@xfail_pyarrow  # TypeError: an integer is required
def test_index_col_multiindex_columns_no_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(
        StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0
    )
    expected = DataFrame(
        [],
        index=Index([]),
        columns=MultiIndex.from_arrays(
            [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
        ),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_index_col_header_no_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0)
    expected = DataFrame(
        [],
        columns=["a1", "a2"],
        index=Index([], name="a0"),
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_multiindex_columns_no_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1])
    expected = DataFrame(
        [], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]])
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_multiindex_columns_index_col_with_data(all_parsers):
    # GH#38292
    parser = all_parsers
    result = parser.read_csv(
        StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0
    )
    expected = DataFrame(
        [["data", "data"]],
        columns=MultiIndex.from_arrays(
            [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
        ),
        index=Index(["data"]),
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
def test_infer_types_boolean_sum(all_parsers):
    # GH#44079
    parser = all_parsers
    result = parser.read_csv(
        StringIO("0,1"),
        names=["a", "b"],
        index_col=["a"],
        dtype={"a": "UInt8"},
    )
    expected = DataFrame(
        data={
            "a": [
                0,
            ],
            "b": [1],
        }
    ).set_index("a")
    # Not checking index type now, because the C parser will return an
    # index column of dtype 'object', and the Python parser will return an
    # index column of dtype 'int64'.
    tm.assert_frame_equal(result, expected, check_index_type=False)


@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
    # GH#9435
    data = "a,b\n01,2"
    parser = all_parsers
    if dtype == object and parser.engine == "pyarrow":
        request.applymarker(
            pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
        )
    result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
    expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # TypeError: an integer is required
def test_multiindex_columns_not_leading_index_col(all_parsers):
    # GH#38549
    parser = all_parsers
    data = """a,b,c,d
e,f,g,h
x,y,1,2
"""
    result = parser.read_csv(
        StringIO(data),
        header=[0, 1],
        index_col=1,
    )
    cols = MultiIndex.from_tuples(
        [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"]
    )
    expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
    tm.assert_frame_equal(result, expected)
@@ -0,0 +1,182 @@
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""
from io import StringIO

import pytest

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_basic(all_parsers):
    parser = all_parsers

    data = "a,a,b,b,b\n1,2,3,4,5"
    result = parser.read_csv(StringIO(data), sep=",")

    expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_basic_names(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "a,b,a\n0,1,2\n3,4,5"
    expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_basic_names_raise(all_parsers):
    # See gh-7160
    parser = all_parsers

    data = "0,1,2\n3,4,5"
    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=["a", "b", "a"])


@xfail_pyarrow  # ValueError: Found non-unique column index
@pytest.mark.parametrize(
    "data,expected",
    [
        ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
        (
            "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
            DataFrame(
                [[1, 2, 3, 4, 5, 6]],
                columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
            ),
        ),
        (
            "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
            DataFrame(
                [[1, 2, 3, 4, 5, 6, 7]],
                columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
            ),
        ),
    ],
)
def test_thorough_mangle_columns(all_parsers, data, expected):
    # see gh-17060
    parser = all_parsers

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,names,expected",
    [
        (
            "a,b,b\n1,2,3",
            ["a.1", "a.1", "a.1.1"],
            DataFrame(
                [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
            ),
        ),
        (
            "a,b,c,d,e,f\n1,2,3,4,5,6",
            ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
            DataFrame(
                [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
                columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
            ),
        ),
        (
            "a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
            ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
            DataFrame(
                [
                    ["a", "b", "c", "d", "e", "f", "g"],
                    ["1", "2", "3", "4", "5", "6", "7"],
                ],
                columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
            ),
        ),
    ],
)
def test_thorough_mangle_names(all_parsers, data, names, expected):
    # see gh-17095
    parser = all_parsers

    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=names)


@xfail_pyarrow  # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
    # xref gh-13017
    orig_key = "0"
    parser = all_parsers

    orig_value = [1, 2, 3]
    df = DataFrame({orig_key: orig_value})

    # This test iteratively re-reads `df` from its own CSV output.
    for i in range(3):
        expected = DataFrame(columns=Index([], dtype="str"))

        for j in range(i + 1):
            col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
            expected.insert(loc=0, column=col_name, value=[0, 1, 2])

        expected[orig_key] = orig_value
        df = parser.read_csv(StringIO(df.to_csv()))

        tm.assert_frame_equal(df, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists(all_parsers):
    # GH#14704
    parser = all_parsers

    data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6, 7]],
        columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
    # GH#14704
    parser = all_parsers

    data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [[1, 2, 3, 4]],
        columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")])
def test_mangle_cols_names(all_parsers, usecol, engine):
    # GH 11823
    parser = all_parsers
    data = "1,2,3"
    names = ["A", "A", "B"]
    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=names, usecols=usecol, engine=engine)
@@ -0,0 +1,157 @@
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.util.version import Version

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

# We'll probably always skip these for pyarrow
# Maybe we'll add our own tests for pyarrow too
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.slow,
]


@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_multi_thread_string_io_read_csv(all_parsers, request):
    # see gh-11786
    parser = all_parsers
    if parser.engine == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        if Version(pa.__version__) < Version("16.0"):
            request.applymarker(
                pytest.mark.xfail(reason="# ValueError: Found non-unique column index")
            )
    max_row_range = 100
    num_files = 10

    bytes_to_df = (
        "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
        for _ in range(num_files)
    )

    # Read all files in many threads.
    with ExitStack() as stack:
        files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df]

        pool = stack.enter_context(ThreadPool(8))

        results = pool.map(parser.read_csv, files)
        first_result = results[0]

        for result in results:
            tm.assert_frame_equal(first_result, result)


def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
    """
    Generate a DataFrame using multiple threads.

    Parameters
    ----------
    parser : BaseParser
        The parser object to use for reading the data.
    path : str
        The location of the CSV file to read.
    num_rows : int
        The number of rows to read per task.
    num_tasks : int
        The number of tasks to use for reading this DataFrame.

    Returns
    -------
    df : DataFrame
    """

    def reader(arg):
        """
        Create a reader for part of the CSV.

        Parameters
        ----------
        arg : tuple
            A tuple of the following:

            * start : int
                The row at which to start parsing the CSV.
            * nrows : int
                The number of rows to read.

        Returns
        -------
        df : DataFrame
        """
        start, nrows = arg

        if not start:
            return parser.read_csv(
                path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
            )

        return parser.read_csv(
            path,
            index_col=0,
            header=None,
            skiprows=int(start) + 1,
            nrows=nrows,
            parse_dates=[9],
        )

    tasks = [
        (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
    ]

    with ThreadPool(processes=num_tasks) as pool:
        results = pool.map(reader, tasks)

    header = results[0].columns

    for r in results[1:]:
        r.columns = header

    final_dataframe = pd.concat(results)
    return final_dataframe


@xfail_pyarrow  # ValueError: The 'nrows' option is not supported
def test_multi_thread_path_multipart_read_csv(all_parsers):
    # see gh-11786
    num_tasks = 4
    num_rows = 48

    parser = all_parsers
    file_name = "__thread_pool_reader__.csv"
    df = DataFrame(
        {
            "a": np.random.default_rng(2).random(num_rows),
            "b": np.random.default_rng(2).random(num_rows),
            "c": np.random.default_rng(2).random(num_rows),
            "d": np.random.default_rng(2).random(num_rows),
            "e": np.random.default_rng(2).random(num_rows),
            "foo": ["foo"] * num_rows,
            "bar": ["bar"] * num_rows,
            "baz": ["baz"] * num_rows,
            "date": pd.date_range("20000101 09:00:00", periods=num_rows, freq="s"),
            "int": np.arange(num_rows, dtype="int64"),
        }
    )

    with tm.ensure_clean(file_name) as path:
        df.to_csv(path)

        final_dataframe = _generate_multi_thread_dataframe(
            parser, path, num_rows, num_tasks
        )
        tm.assert_frame_equal(df, final_dataframe)
@@ -0,0 +1,780 @@
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas._libs.parsers import STR_NA_VALUES

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_string_nas(all_parsers):
    parser = all_parsers
    data = """A,B,C
a,b,c
d,,f
,g,h
"""
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
        columns=["A", "B", "C"],
    )
    if parser.engine == "pyarrow":
        expected.loc[2, "A"] = None
        expected.loc[1, "B"] = None
    tm.assert_frame_equal(result, expected)


def test_detect_string_na(all_parsers):
    parser = all_parsers
    data = """A,B
foo,bar
NA,baz
NaN,nan
"""
    expected = DataFrame(
        [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
    )
    if parser.engine == "pyarrow":
        expected.loc[[1, 2], "A"] = None
        expected.loc[2, "B"] = None
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "na_values",
    [
        ["-999.0", "-999"],
        [-999, -999.0],
        [-999.0, -999],
        ["-999.0"],
        ["-999"],
        [-999.0],
        [-999],
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        """A,B
-999,1.2
2,-999
3,4.5
""",
        """A,B
-999,1.200
2,-999.000
3,4.500
""",
    ],
)
def test_non_string_na_values(all_parsers, data, na_values, request):
    # see gh-3611: with an odd float format, we can't match
    # the string "999.0" exactly but still need float matching
    parser = all_parsers
    expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])

    if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values):
        msg = "The 'pyarrow' engine requires all na_values to be strings"
        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), na_values=na_values)
        return
    elif parser.engine == "pyarrow" and "-999.000" in data:
        # bc the pyarrow engine does not include the float-ified version
        # of "-999" -> -999, it does not match the entry with the trailing
        # zeros, so "-999.000" is not treated as null.
        mark = pytest.mark.xfail(
            reason="pyarrow engine does not recognize equivalent floats"
        )
        request.applymarker(mark)

    result = parser.read_csv(StringIO(data), na_values=na_values)
    tm.assert_frame_equal(result, expected)


def test_default_na_values(all_parsers):
    _NA_VALUES = {
        "-1.#IND",
        "1.#QNAN",
        "1.#IND",
        "-1.#QNAN",
        "#N/A",
        "N/A",
        "n/a",
        "NA",
        "<NA>",
        "#NA",
        "NULL",
        "null",
        "NaN",
        "nan",
        "-NaN",
        "-nan",
        "#N/A N/A",
        "",
        "None",
    }
    assert _NA_VALUES == STR_NA_VALUES

    parser = all_parsers
    nv = len(_NA_VALUES)

    def f(i, v):
        if i == 0:
            buf = ""
        elif i > 0:
            buf = "".join([","] * i)

        buf = f"{buf}{v}"

        if i < nv - 1:
            joined = "".join([","] * (nv - i - 1))
            buf = f"{buf}{joined}"

        return buf

    data = StringIO("\n".join([f(i, v) for i, v in enumerate(_NA_VALUES)]))
    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))

    result = parser.read_csv(data, header=None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = DataFrame(
        [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
    )
    if parser.engine == "pyarrow":
        msg = "skiprows argument must be an integer when using engine='pyarrow'"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
        return

    result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
    tm.assert_frame_equal(result, expected)


def test_bool_na_values(all_parsers):
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        {
            "A": np.array([True, np.nan, False], dtype=object),
            "B": np.array([False, True, np.nan], dtype=object),
            "C": [True, False, True],
        }
    )
    if parser.engine == "pyarrow":
        expected.loc[1, "A"] = None
        expected.loc[2, "B"] = None
    tm.assert_frame_equal(result, expected)


def test_na_value_dict(all_parsers):
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "pyarrow engine doesn't support passing a dict for na_values"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
        return

    df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
    expected = DataFrame(
        {
            "A": [np.nan, "bar", np.nan, "bar"],
            "B": [np.nan, "foo", np.nan, "foo"],
            "C": [np.nan, "foo", np.nan, "foo"],
        }
    )
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize(
    "index_col,expected",
    [
        (
            [0],
            DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
        ),
        (
            [0, 2],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
        (
            ["a", "c"],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
    ],
)
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
    data = """\
a,b,c,d
0,NA,1,5
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,expected",
    [
        (
            {},
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
        (
            {"na_values": {"A": [], "C": []}, "keep_default_na": False},
            DataFrame(
                {
                    "A": ["a", "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            {"na_values": ["a"], "keep_default_na": False},
            DataFrame(
                {
                    "A": [np.nan, "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            {"na_values": {"A": [], "C": []}},
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
    ],
)
def test_na_values_keep_default(
    all_parsers, kwargs, expected, request, using_infer_string
):
    data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    if parser.engine == "pyarrow":
        if "na_values" in kwargs and isinstance(kwargs["na_values"], dict):
            msg = "The pyarrow engine doesn't support passing a dict for na_values"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(StringIO(data), **kwargs)
            return
        if not using_infer_string or "na_values" in kwargs:
            mark = pytest.mark.xfail()
            request.applymarker(mark)

    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_no_na_values_no_keep_default(all_parsers):
    # see gh-4318: passing na_values=None and
    # keep_default_na=False yields "None" as an na_value
    data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), keep_default_na=False)

    expected = DataFrame(
        {
            "A": ["a", "b", "", "d", "e", "nan", "g"],
            "B": [1, 2, 3, 4, 5, 6, 7],
            "C": ["None", "two", "None", "nan", "five", "", "seven"],
        }
    )
    tm.assert_frame_equal(result, expected)


def test_no_keep_default_na_dict_na_values(all_parsers):
    # see gh-19227
    data = "a,b\n,2"
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine doesn't support passing a dict for na_values"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
            )
        return

    result = parser.read_csv(
        StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
    )
    expected = DataFrame({"a": [""], "b": [np.nan]})
    tm.assert_frame_equal(result, expected)


def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    # see gh-19227
    #
    # Scalar values shouldn't cause the parsing to crash or fail.
    data = "a,b\n1,2"
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine doesn't support passing a dict for na_values"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
        return

    df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
    expected = DataFrame({"a": [1], "b": [np.nan]})
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
    # see gh-19227
    data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
    parser = all_parsers
    expected = DataFrame(
        {
            0: [np.nan, 729639.0],
            1: [np.nan, "qwer"],
            2: ["/blaha", np.nan],
            3: ["kjsdkj", "asdfkj"],
            4: [412.166, 466.681],
            5: ["225.874", ""],
            6: [np.nan, 252.373],
        }
    )

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine doesn't support passing a dict for na_values"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data),
                header=None,
                keep_default_na=False,
                na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
            )
        return

    result = parser.read_csv(
        StringIO(data),
        header=None,
        keep_default_na=False,
        na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_filter,row_data",
|
||||
[
|
||||
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
|
||||
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
|
||||
],
|
||||
)
|
||||
def test_na_values_na_filter_override(
|
||||
request, all_parsers, na_filter, row_data, using_infer_string
|
||||
):
|
||||
parser = all_parsers
|
||||
if parser.engine == "pyarrow":
|
||||
# mismatched dtypes in both cases, FutureWarning in the True case
|
||||
if not (using_infer_string and na_filter):
|
||||
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
|
||||
request.applymarker(mark)
|
||||
data = """\
|
||||
A,B
|
||||
1,A
|
||||
nan,B
|
||||
3,C
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
|
||||
|
||||
expected = DataFrame(row_data, columns=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow # CSV parse error: Expected 8 columns, got 5:
|
||||
def test_na_trailing_columns(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
|
||||
2012-03-14,USD,AAPL,BUY,1000
|
||||
2012-05-12,USD,SBUX,SELL,500"""
|
||||
|
||||
# Trailing columns should be all NaN.
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[
|
||||
["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
|
||||
["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
|
||||
],
|
||||
columns=[
|
||||
"Date",
|
||||
"Currency",
|
||||
"Symbol",
|
||||
"Type",
|
||||
"Units",
|
||||
"UnitPrice",
|
||||
"Cost",
|
||||
"Tax",
|
||||
],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_values,row_data",
|
||||
[
|
||||
(1, [[np.nan, 2.0], [2.0, np.nan]]),
|
||||
({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
|
||||
],
|
||||
)
|
||||
def test_na_values_scalar(all_parsers, na_values, row_data):
|
||||
# see gh-12224
|
||||
parser = all_parsers
|
||||
names = ["a", "b"]
|
||||
data = "1,2\n2,1"
|
||||
|
||||
if parser.engine == "pyarrow" and isinstance(na_values, dict):
|
||||
if isinstance(na_values, dict):
|
||||
err = ValueError
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
else:
|
||||
err = TypeError
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(err, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
elif parser.engine == "pyarrow":
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
expected = DataFrame(row_data, columns=names)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_na_values_dict_aliasing(all_parsers):
|
||||
parser = all_parsers
|
||||
na_values = {"a": 2, "b": 1}
|
||||
na_values_copy = na_values.copy()
|
||||
|
||||
names = ["a", "b"]
|
||||
data = "1,2\n2,1"
|
||||
|
||||
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_dict_equal(na_values, na_values_copy)
|
||||
|
||||
|
||||
def test_na_values_dict_col_index(all_parsers):
|
||||
# see gh-14203
|
||||
data = "a\nfoo\n1"
|
||||
parser = all_parsers
|
||||
na_values = {0: "foo"}
|
||||
|
||||
if parser.engine == "pyarrow":
|
||||
msg = "The pyarrow engine doesn't support passing a dict for na_values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), na_values=na_values)
|
||||
return
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_values=na_values)
|
||||
expected = DataFrame({"a": [np.nan, 1]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,kwargs,expected",
|
||||
[
|
||||
(
|
||||
str(2**63) + "\n" + str(2**63 + 1),
|
||||
{"na_values": [2**63]},
|
||||
DataFrame([str(2**63), str(2**63 + 1)]),
|
||||
),
|
||||
(str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
|
||||
(str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
|
||||
],
|
||||
)
|
||||
def test_na_values_uint64(all_parsers, data, kwargs, expected, request):
|
||||
# see gh-14983
|
||||
parser = all_parsers
|
||||
|
||||
if parser.engine == "pyarrow" and "na_values" in kwargs:
|
||||
msg = "The 'pyarrow' engine requires all na_values to be strings"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
return
|
||||
elif parser.engine == "pyarrow":
|
||||
mark = pytest.mark.xfail(reason="Returns float64 instead of object")
|
||||
request.applymarker(mark)
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=None, **kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_na_values_no_default_with_index(all_parsers):
|
||||
# see gh-15835
|
||||
data = "a,1\nb,2"
|
||||
parser = all_parsers
|
||||
expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
|
||||
)
|
||||
def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request):
|
||||
# see gh-5239
|
||||
#
|
||||
# Don't parse NA-values in index unless na_filter=True
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,,3\n4,5,6"
|
||||
|
||||
if parser.engine == "pyarrow" and na_filter is False:
|
||||
mark = pytest.mark.xfail(reason="mismatched index result")
|
||||
request.applymarker(mark)
|
||||
|
||||
expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
|
||||
result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_inf_na_values_with_int_index(all_parsers):
|
||||
# see gh-17128
|
||||
parser = all_parsers
|
||||
data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
|
||||
|
||||
# Don't fail with OverflowError with inf's and integer index column.
|
||||
out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
|
||||
expected = DataFrame(
|
||||
{"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
|
||||
)
|
||||
tm.assert_frame_equal(out, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # mismatched shape
|
||||
@pytest.mark.parametrize("na_filter", [True, False])
|
||||
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
|
||||
# see gh-20377
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,,3\n4,5,6"
|
||||
|
||||
# na_filter=True --> missing value becomes NaN.
|
||||
# na_filter=False --> missing value remains empty string.
|
||||
empty = np.nan if na_filter else ""
|
||||
expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow # mismatched exception message
|
||||
@pytest.mark.parametrize(
|
||||
"data, na_values",
|
||||
[
|
||||
("false,1\n,1\ntrue", None),
|
||||
("false,1\nnull,1\ntrue", None),
|
||||
("false,1\nnan,1\ntrue", None),
|
||||
("false,1\nfoo,1\ntrue", "foo"),
|
||||
("false,1\nfoo,1\ntrue", ["foo"]),
|
||||
("false,1\nfoo,1\ntrue", {"a": "foo"}),
|
||||
],
|
||||
)
|
||||
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
|
||||
parser = all_parsers
|
||||
msg = "|".join(
|
||||
[
|
||||
"Bool column has NA values in column [0a]",
|
||||
"cannot safely convert passed user dtype of "
|
||||
"bool for object dtyped data in column 0",
|
||||
]
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(
|
||||
StringIO(data),
|
||||
header=None,
|
||||
names=["a", "b"],
|
||||
dtype={"a": "bool"},
|
||||
na_values=na_values,
|
||||
)
|
||||
|
||||
|
||||
# TODO: this test isn't about the na_values keyword, it is about the empty entries
# being returned with NaN entries, whereas the pyarrow engine returns "nan"
@xfail_pyarrow  # mismatched shapes
def test_str_nan_dropped(all_parsers):
    # see gh-21131
    parser = all_parsers

    data = """File: small.csv,,
10010010233,0123,654
foo,,bar
01001000155,4530,898"""

    result = parser.read_csv(
        StringIO(data),
        header=None,
        names=["col1", "col2", "col3"],
        dtype={"col1": str, "col2": str, "col3": str},
    ).dropna()

    expected = DataFrame(
        {
            "col1": ["10010010233", "01001000155"],
            "col2": ["0123", "4530"],
            "col3": ["654", "898"],
        },
        index=[1, 3],
    )

    tm.assert_frame_equal(result, expected)


def test_nan_multi_index(all_parsers):
    # GH 42446
    parser = all_parsers
    data = "A,B,B\nX,Y,Z\n1,2,inf"

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine doesn't support passing a dict for na_values"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(
                StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
            )
        return

    result = parser.read_csv(
        StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
    )

    expected = DataFrame(
        {
            ("A", "X"): [1],
            ("B", "Y"): [2],
            ("B", "Z"): [np.nan],
        }
    )

    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # Failed: DID NOT RAISE <class 'ValueError'>; it casts the NaN to False
def test_bool_and_nan_to_bool(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    with pytest.raises(ValueError, match="NA values"):
        parser.read_csv(StringIO(data), dtype="bool")


def test_bool_and_nan_to_int(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    with pytest.raises(ValueError, match="convert|NoneType"):
        parser.read_csv(StringIO(data), dtype="int")


def test_bool_and_nan_to_float(all_parsers):
    # GH#42808
    parser = all_parsers
    data = """0
NaN
True
False
"""
    result = parser.read_csv(StringIO(data), dtype="float")
    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
    tm.assert_frame_equal(result, expected)
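

# Standalone sketch of the casting rules pinned down by the three GH#42808
# tests above: a column containing NaN can be cast to float (NaN is
# representable there) but casting it to bool raises.  Illustrative only.
def _demo_nan_casting():
    from io import StringIO

    import numpy as np
    import pandas as pd
    import pytest

    data = "0\nNaN\nTrue\nFalse\n"
    as_float = pd.read_csv(StringIO(data), dtype="float")
    assert np.isnan(as_float.iloc[0, 0])  # NaN is representable as float
    with pytest.raises(ValueError):
        pd.read_csv(StringIO(data), dtype="bool")  # NA values break bool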
@@ -0,0 +1,327 @@
"""
Tests the parsers' ability to read and parse non-local files,
which hence require a network connection to be read.
"""
from io import BytesIO
import logging
import re

import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import DataFrame
import pandas._testing as tm

from pandas.io.feather_format import read_feather
from pandas.io.parsers import read_csv

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.mark.network
@pytest.mark.single_cpu
@pytest.mark.parametrize("mode", ["explicit", "infer"])
@pytest.mark.parametrize("engine", ["python", "c"])
def test_compressed_urls(
    httpserver,
    datapath,
    salaries_table,
    mode,
    engine,
    compression_only,
    compression_to_extension,
):
    # test reading compressed urls with various engines and
    # extension inference
    if compression_only == "tar":
        pytest.skip("TODO: Add tar salaries.csv to pandas/io/parsers/data")

    extension = compression_to_extension[compression_only]
    with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f:
        httpserver.serve_content(content=f.read())

    url = httpserver.url + "/salaries.csv" + extension

    if mode != "explicit":
        compression_only = mode

    url_table = read_csv(url, sep="\t", compression=compression_only, engine=engine)
    tm.assert_frame_equal(url_table, salaries_table)


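# Hedged sketch of the "explicit" vs "infer" compression modes checked above,
# using a local gzip file so no HTTP server is needed (the tmp_path argument
# follows the usual pytest fixture convention; the file name is arbitrary).
def _demo_compression_infer(tmp_path):
    import gzip

    import pandas as pd

    path = tmp_path / "salaries.csv.gz"
    with gzip.open(path, "wt") as fh:
        fh.write("S\tX\n100\t1\n")
    inferred = pd.read_csv(path, sep="\t")  # ".gz" suffix implies gzip
    explicit = pd.read_csv(path, sep="\t", compression="gzip")
    assert inferred.equals(explicit)

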
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_encoding_csv(httpserver, datapath):
    """
    read_csv should honor the requested encoding for URLs.

    GH 10424
    """
    with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f:
        httpserver.serve_content(content=f.read())
        df = read_csv(httpserver.url, encoding="latin-1", header=None)
        assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"


@pytest.fixture
def tips_df(datapath):
    """DataFrame with the tips dataset."""
    return read_csv(datapath("io", "data", "csv", "tips.csv"))


@pytest.mark.single_cpu
@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3:
    def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # more of an integration test due to the not-public contents portion
        # can probably mock this though.
        pytest.importorskip("s3fs")
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so):
        # Read public file from bucket with not-public contents
        pytest.importorskip("s3fs")
        df = read_csv(
            f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so
        )
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # Read from AWS s3 as "s3n" URL
        df = read_csv(
            f"s3n://{s3_public_bucket_with_data.name}/tips.csv",
            nrows=10,
            storage_options=s3so,
        )
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # Read from AWS s3 as "s3a" URL
        df = read_csv(
            f"s3a://{s3_public_bucket_with_data.name}/tips.csv",
            nrows=10,
            storage_options=s3so,
        )
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_nrows(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_chunked(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Read with a chunksize
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them
                    # properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_chunked_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Read with a chunksize using the Python parser
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                engine="python",
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                engine="python",
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so):
        for ext in ["", ".gz", ".bz2"]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                engine="python",
                compression="infer",
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3_bucket_nrows_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
                engine="python",
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_read_s3_fails(self, s3so):
        msg = "The specified bucket does not exist"
        with pytest.raises(OSError, match=msg):
            read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)

    def test_read_s3_fails_private(self, s3_private_bucket, s3so):
        msg = "The specified bucket does not exist"
        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        with pytest.raises(OSError, match=msg):
            read_csv(f"s3://{s3_private_bucket.name}/file.csv")

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_csv_fails(self, tips_df, s3so):
        # GH 32486
        # Attempting to write to an invalid S3 path should raise
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)

        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_csv(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
            )

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_parquet_fails(self, tips_df, s3so):
        # GH 27679
        # Attempting to write to an invalid S3 path should raise
        pytest.importorskip("pyarrow")
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)

        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_parquet(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
                storage_options=s3so,
            )

    @pytest.mark.single_cpu
    def test_read_csv_handles_boto_s3_object(
        self, s3_public_bucket_with_data, tips_file
    ):
        # see gh-16135

        s3_object = s3_public_bucket_with_data.Object("tips.csv")

        with BytesIO(s3_object.get()["Body"].read()) as buffer:
            result = read_csv(buffer, encoding="utf8")
            assert isinstance(result, DataFrame)
            assert not result.empty

            expected = read_csv(tips_file)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.single_cpu
    def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
        # 8 MB, S3FS uses 5MB chunks
        df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
        with BytesIO(df.to_csv().encode("utf-8")) as buf:
            s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
            uri = f"{s3_public_bucket.name}/large-file.csv"
            match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
            with caplog.at_level(logging.DEBUG, logger="s3fs"):
                read_csv(
                    f"s3://{uri}",
                    nrows=5,
                    storage_options=s3so,
                )
            for log in caplog.messages:
                if match := re.match(match_re, log):
                    # Less than 8 MB
                    assert int(match.group("stop")) < 8000000

    def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
        # GH 25945
        result = read_csv(
            f"s3://{s3_public_bucket_with_data.name}/tips#1.csv", storage_options=s3so
        )
        tm.assert_frame_equal(tips_df, result)

    def test_read_feather_s3_file_path(
        self, s3_public_bucket_with_data, feather_file, s3so
    ):
        # GH 29055
        pytest.importorskip("pyarrow")
        expected = read_feather(feather_file)
        res = read_feather(
            f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather",
            storage_options=s3so,
        )
        tm.assert_frame_equal(expected, res)
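

# Hedged sketch: the s3:// URLs and storage_options above are routed through
# fsspec, so any fsspec protocol behaves the same way.  The in-memory
# filesystem needs no credentials or network (assumes the optional fsspec
# dependency is installed).
def _demo_fsspec_memory_read():
    import fsspec

    import pandas as pd

    mem = fsspec.filesystem("memory")
    with mem.open("/tips.csv", "w") as fh:
        fh.write("total_bill,tip\n16.99,1.01\n")
    df = pd.read_csv("memory://tips.csv")  # same code path as s3:// reads
    assert list(df.columns) == ["total_bill", "tip"]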
File diff suppressed because it is too large
@@ -0,0 +1,566 @@
"""
Tests that apply specifically to the Python parser. Unless specifically
stated as a Python-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the C parser can accept
further arguments when parsing.
"""
from __future__ import annotations

import csv
from io import (
    BytesIO,
    StringIO,
    TextIOWrapper,
)
from typing import TYPE_CHECKING

import numpy as np
import pytest

from pandas.errors import (
    ParserError,
    ParserWarning,
)

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
)
import pandas._testing as tm

if TYPE_CHECKING:
    from collections.abc import Iterator


def test_default_separator(python_parser_only):
    # see gh-17333
    #
    # csv.Sniffer in Python treats "o" as separator.
    data = "aob\n1o2\n3o4"
    parser = python_parser_only
    expected = DataFrame({"a": [1, 3], "b": [2, 4]})

    result = parser.read_csv(StringIO(data), sep=None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
    # see gh-15925 (comment)
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter must be an integer"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=skipfooter)


def test_invalid_skipfooter_negative(python_parser_only):
    # see gh-15925 (comment)
    data = "a\n1\n2"
    parser = python_parser_only
    msg = "skipfooter cannot be negative"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=-1)


@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
def test_sniff_delimiter(python_parser_only, kwargs):
    data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    parser = python_parser_only
    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


def test_sniff_delimiter_comment(python_parser_only):
    data = """# comment line
index|A|B|C
# comment line
foo|1|2|3 # ignore | this
bar|4|5|6
baz|7|8|9
"""
    parser = python_parser_only
    result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#")
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    parser = python_parser_only
    data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""

    if encoding is not None:
        data = data.encode(encoding)
        data = BytesIO(data)
        data = TextIOWrapper(data, encoding=encoding)
    else:
        data = StringIO(data)

    result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding)
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )
    tm.assert_frame_equal(result, expected)


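# Roughly what sep=None does under the hood: the Python engine feeds a sample
# of the data to the stdlib csv.Sniffer to guess the delimiter.  A hedged
# sketch of that mechanism using the stdlib directly:
def _demo_sniffer():
    import csv

    sample = "index|A|B|C\nfoo|1|2|3\nbar|4|5|6\n"
    dialect = csv.Sniffer().sniff(sample)
    assert dialect.delimiter == "|"

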
def test_single_line(python_parser_only):
    # see gh-6607: sniff separator
    parser = python_parser_only
    result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)

    expected = DataFrame({"a": [1], "b": [2]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
def test_skipfooter(python_parser_only, kwargs):
    # see gh-6607
    data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
    parser = python_parser_only
    result = parser.read_csv(StringIO(data), **kwargs)

    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
)
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
    # see gh-6607
    parser = python_parser_only

    with open(csv1, "rb") as f:
        data = f.read()

    data = data.replace(b",", b"::")
    expected = parser.read_csv(csv1)

    module = pytest.importorskip(compression)
    klass = getattr(module, klass)

    with tm.ensure_clean() as path:
        with klass(path, mode="wb") as tmp:
            tmp.write(data)

        result = parser.read_csv(path, sep="::", compression=compression)
        tm.assert_frame_equal(result, expected)


def test_read_csv_buglet_4x_multi_index(python_parser_only):
    # see gh-6607
    data = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
    parser = python_parser_only

    expected = DataFrame(
        [
            [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
            [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
            [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
        ],
        columns=["A", "B", "C", "D", "E"],
        index=MultiIndex.from_tuples(
            [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
            names=["one", "two", "three", "four"],
        ),
    )
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


def test_read_csv_buglet_4x_multi_index2(python_parser_only):
    # see gh-6893
    data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
    parser = python_parser_only

    expected = DataFrame.from_records(
        [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
        columns=list("abcABC"),
        index=list("abc"),
    )
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
    # see gh-6971
    data = "1#2\n3#4"
    parser = python_parser_only
    expected = DataFrame({"a": [1.2, 3.4]})

    if add_footer:
        # The stray footer line should not mess with the
        # casting of the first two lines if we skip it.
        kwargs = {"skipfooter": 1}
        data += "\nFooter"
    else:
        kwargs = {}

    result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
    "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    # see gh-3404
    expected = DataFrame({"a": [1], "b": [2]})
    parser = python_parser_only

    data = "1" + sep + "2"
    encoded_data = data.encode(encoding)

    result = parser.read_csv(
        BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
    # see gh-13374
    kwargs = {"sep": ",,"}
    parser = python_parser_only

    data = 'a,,b\n1,,a\n2,,"2,,b"'

    if quoting == csv.QUOTE_NONE:
        msg = "Expected 2 fields in line 3, saw 3"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
    else:
        msg = "ignored when a multi-char delimiter is used"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), quoting=quoting, **kwargs)


def test_none_delimiter(python_parser_only):
    # see gh-13374 and gh-17465
    parser = python_parser_only
    data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
    expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})

    # We expect the third line in the data to be
    # skipped because it is malformed, but we do
    # not expect any errors to occur.
    with tm.assert_produces_warning(
        ParserWarning, match="Skipping line 3", check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), header=0, sep=None, on_bad_lines="warn"
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
    # see gh-13879 and gh-15910
    parser = python_parser_only
    if skipfooter:
        msg = "parsing errors in the skipped footer rows"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), skipfooter=skipfooter)
    else:
        msg = "unexpected end of data|expected after"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), skipfooter=skipfooter)


def test_malformed_skipfooter(python_parser_only):
    parser = python_parser_only
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
    msg = "Expected 3 fields in line 4, saw 5"
    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)


def test_python_engine_file_no_next(python_parser_only):
    parser = python_parser_only

    class NoNextBuffer:
        def __init__(self, csv_data) -> None:
            self.data = csv_data

        def __iter__(self) -> Iterator:
            return self.data.__iter__()

        def read(self):
            return self.data

        def readline(self):
            return self.data

    parser.read_csv(NoNextBuffer("a\n1"))


@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)
    result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)
    lst = []

    def bad_line_func(bad_line: list[str]) -> list[str]:
        lst.append(bad_line)
        return ["2", "3"]

    result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
    tm.assert_frame_equal(result, expected)
    assert lst == [["2", "3", "4", "5", "6"]]


@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
@pytest.mark.parametrize("sep", [",", "111"])
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
    # GH 5686
    # iterator=True has a separate code path from iterator=False
    parser = python_parser_only
    data = f"""
0{sep}1
hi{sep}there
foo{sep}bar{sep}baz
good{sep}bye
"""
    bad_sio = StringIO(data)
    result_iter = parser.read_csv(
        bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
    )
    expecteds = [
        {"0": "hi", "1": "there"},
        {"0": "foo", "1": "bar"},
        {"0": "good", "1": "bye"},
    ]
    for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
        expected = DataFrame(expected, index=range(i, i + 1))
        tm.assert_frame_equal(result, expected)


def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)
    msg = "This function is buggy."

    def bad_line_func(bad_line):
        raise ValueError(msg)

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(bad_sio, on_bad_lines=bad_line_func)


def test_on_bad_lines_callable_not_expected_length(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)

    result = parser.read_csv_check_warnings(
        ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
    )
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_callable_returns_none(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2
2,3,4,5,6
3,4
"""
    bad_sio = StringIO(data)

    result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
    expected = DataFrame({"a": [1, 3], "b": [2, 4]})
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_index_col_inferred(python_parser_only):
    # GH 5686
    parser = python_parser_only
    data = """a,b
1,2,3
4,5,6
"""
    bad_sio = StringIO(data)

    result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
    expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
    tm.assert_frame_equal(result, expected)


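# Condensed sketch of the callable on_bad_lines contract exercised above:
# the Python engine hands each offending row to the callable as a list of
# strings; returning a list keeps (a repaired version of) the row, and
# returning None drops it.  Illustrative only.
def _demo_on_bad_lines_callable():
    from io import StringIO

    import pandas as pd

    data = "a,b\n1,2\n2,3,4,5,6\n3,4\n"
    repaired = pd.read_csv(
        StringIO(data), on_bad_lines=lambda row: row[:2], engine="python"
    )
    dropped = pd.read_csv(
        StringIO(data), on_bad_lines=lambda row: None, engine="python"
    )
    assert len(repaired) == 3 and len(dropped) == 2

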
def test_index_col_false_and_header_none(python_parser_only):
    # GH#46955
    parser = python_parser_only
    data = """
0.5,0.03
0.1,0.2,0.3,2
"""
    result = parser.read_csv_check_warnings(
        ParserWarning,
        "Length of header",
        StringIO(data),
        sep=",",
        header=None,
        index_col=False,
    )
    expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]})
    tm.assert_frame_equal(result, expected)


def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only):
    # GH#46569
    parser = python_parser_only
    data = StringIO("a\na,b\nc,d,e\nf,g,h")
    result = parser.read_csv_check_warnings(
        ParserWarning, "Length of header", data, engine="python", index_col=False
    )
    expected = DataFrame({"a": ["a", "c", "f"]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
    # GH#50270
    parser = python_parser_only
    data = """\
a;b;c
0000.7995;16.000;0
3.03.001.00514;0;4.000
4923.600.041;23.000;131"""
    result = parser.read_csv(
        StringIO(data),
        sep=";",
        dtype=dtype,
        thousands=".",
    )
    expected = DataFrame(
        {
            "a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
            "b": [16000, 0, 23000],
            "c": [0, 4000, 131],
        }
    )
    if dtype["a"] == object:
        expected["a"] = expected["a"].astype(object)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype,expected",
    [
        (
            {"a": str, "b": np.float64, "c": np.int64},
            DataFrame(
                {
                    "b": [16000.1, 0, 23000],
                    "c": [0, 4001, 131],
                }
            ),
        ),
        (
            str,
            DataFrame(
                {
                    "b": ["16,000.1", "0", "23,000"],
                    "c": ["0", "4,001", "131"],
                }
            ),
        ),
    ],
)
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
    # GH#50270
    parser = python_parser_only
    data = """a;b;c
0000,7995;16,000.1;0
3,03,001,00514;0;4,001
4923,600,041;23,000;131
"""
    result = parser.read_csv(
        StringIO(data),
        sep=";",
        dtype=dtype,
        thousands=",",
    )
    expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
    tm.assert_frame_equal(result, expected)
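

# Standalone sketch of the GH#50270 behavior above: the thousands separator
# is only stripped from columns that end up numeric, while columns pinned to
# str/object keep it verbatim.  Illustrative only.
def _demo_thousands_respects_dtype():
    from io import StringIO

    import pandas as pd

    data = "a;b\n1.234;1.234\n"
    df = pd.read_csv(
        StringIO(data), sep=";", thousands=".", dtype={"a": str}, engine="python"
    )
    assert df.loc[0, "a"] == "1.234"  # left alone under dtype=str
    assert df.loc[0, "b"] == 1234  # separator stripped for numeric column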
@@ -0,0 +1,183 @@
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""

import csv
from io import StringIO

import pytest

from pandas.compat import PY311
from pandas.errors import ParserError

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'),
        (
            {"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
            "quotechar must be set if quoting enabled",
        ),
        ({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'),
    ],
)
@skip_pyarrow  # ParserError: CSV parse error: Empty CSV file or block
def test_bad_quote_char(all_parsers, kwargs, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


@pytest.mark.parametrize(
    "quoting,msg",
    [
        ("foo", '"quoting" must be an integer|Argument'),
        (10, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
    ],
)
@xfail_pyarrow  # ValueError: The 'quoting' option is not supported
def test_bad_quoting(all_parsers, quoting, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), quoting=quoting)


def test_quote_char_basic(all_parsers):
    parser = all_parsers
    data = 'a,b,c\n1,2,"cat"'
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), quotechar='"')
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    parser = all_parsers
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    data = 'a,b,c\n1,2,"cat"'
    new_data = data.replace('"', quote_char)

    result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: The 'quoting' option is not supported
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    kwargs = {"quotechar": quote_char, "quoting": quoting}
    data = "a,b,c\n1,2,3"
    parser = all_parsers

    if quoting != csv.QUOTE_NONE:
        # Sanity checking.
        msg = (
            '"quotechar" must be a 1-character string'
            if PY311 and all_parsers.engine == "python" and quote_char == ""
            else "quotechar must be set if quoting enabled"
        )

        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    elif not (PY311 and all_parsers.engine == "python"):
        # Python 3.11+ doesn't support null/blank quote chars in its csv parser
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,exp_data",
    [
        ({}, [[1, 2, "foo"]]),  # Test default.
        # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
        ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),
        # QUOTE_ALL only applies to CSV writing, so no effect on reading.
        ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),
        # QUOTE_NONE tells the reader to do no special handling
        # of quote characters and leave them alone.
        ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),
        # QUOTE_NONNUMERIC tells the reader to cast
        # all non-quoted fields to float
        ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]),
    ],
)
@xfail_pyarrow  # ValueError: The 'quoting' option is not supported
def test_quoting_various(all_parsers, kwargs, exp_data):
    data = '1,2,"foo"'
    parser = all_parsers
    columns = ["a", "b", "c"]

    result = parser.read_csv(StringIO(data), names=columns, **kwargs)
    expected = DataFrame(exp_data, columns=columns)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data, request):
    parser = all_parsers
    data = 'a,b\n3,"4 "" 5"'

    if parser.engine == "pyarrow" and not doublequote:
        mark = pytest.mark.xfail(reason="Mismatched result")
        request.applymarker(mark)

    result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote)
    expected = DataFrame(exp_data, columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
    # see gh-14477
    data = "a\n1"
    parser = all_parsers
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(StringIO(data), quotechar=quotechar)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced, request):
    # see gh-22789.
    parser = all_parsers
    data = 'a,b,c\n1,2,"3'

    if parser.engine == "pyarrow" and not balanced:
        mark = pytest.mark.xfail(reason="Mismatched result")
        request.applymarker(mark)

    if balanced:
        # Re-balance the quoting and read in without errors.
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data + '"'))
        tm.assert_frame_equal(result, expected)
    else:
        msg = (
            "EOF inside string starting at row 1"
            if parser.engine == "c"
            else "unexpected end of data"
        )

        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
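

# Side-by-side sketch of the quoting modes compared above (stdlib csv
# constants; behavior as pinned down by the tests, not a normative spec).
def _demo_quoting_modes():
    import csv
    from io import StringIO

    import pandas as pd

    data = '1,2,"foo"'
    default = pd.read_csv(StringIO(data), names=list("abc"))
    raw = pd.read_csv(StringIO(data), names=list("abc"), quoting=csv.QUOTE_NONE)
    assert default.loc[0, "c"] == "foo"  # quotes consumed
    assert raw.loc[0, "c"] == '"foo"'  # quotes kept verbatim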
File diff suppressed because it is too large
@@ -0,0 +1,334 @@
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""

from datetime import datetime
from io import StringIO

import numpy as np
import pytest

from pandas.errors import EmptyDataError

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
def test_skip_rows_bug(all_parsers, skiprows):
    # see gh-505
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    result = parser.read_csv(
        StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
    )
    index = Index(
        [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
    )

    expected = DataFrame(
        np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_deep_skip_rows(all_parsers):
    # see gh-4382
    parser = all_parsers
    data = "a,b,c\n" + "\n".join(
        [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
    )
    condensed_data = "a,b,c\n" + "\n".join(
        [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
    )

    result = parser.read_csv(StringIO(data), skiprows=[6, 8])
    condensed_result = parser.read_csv(StringIO(condensed_data))
    tm.assert_frame_equal(result, condensed_result)


@xfail_pyarrow  # AssertionError: DataFrame are different
def test_skip_rows_blank(all_parsers):
    # see gh-9832
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c

#foo,a,b,c
#foo,a,b,c

1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    data = parser.read_csv(
        StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
    )
    index = Index(
        [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
    )

    expected = DataFrame(
        np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
    )
    tm.assert_frame_equal(data, expected)


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
            {"skiprows": [1]},
            DataFrame(
                [[2, "line 21\nline 22", 2], [3, "line 31", 1]],
                columns=["id", "text", "num_lines"],
            ),
        ),
        (
            "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
            {"quotechar": "~", "skiprows": [2]},
            DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
        ),
        (
            (
                "Text,url\n~example\n "
                "sentence\n one~,url1\n~"
                "example\n sentence\n two~,url2\n~"
                "example\n sentence\n three~,url3"
            ),
            {"quotechar": "~", "skiprows": [1, 3]},
            DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
        ),
    ],
)
@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_row_with_quote(all_parsers):
    # see gh-12775 and gh-10911
    parser = all_parsers
    data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""

    exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])

    result = parser.read_csv(StringIO(data), skiprows=[1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            """id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
            [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
            [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
            [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
        ),
    ],
)
@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), skiprows=[1])

    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: The 'delim_whitespace' option is not supported
@pytest.mark.parametrize(
    "lineterminator", ["\n", "\r\n", "\r"]  # "LF"  # "CRLF"  # "CR"
)
def test_skiprows_lineterminator(all_parsers, lineterminator, request):
    # see gh-9079
    parser = all_parsers
    data = "\n".join(
        [
            "SMOSMANIA ThetaProbe-ML2X ",
            "2007/01/01 01:00 0.2140 U M ",
            "2007/01/01 02:00 0.2141 M O ",
            "2007/01/01 04:00 0.2142 D M ",
        ]
    )
    expected = DataFrame(
        [
            ["2007/01/01", "01:00", 0.2140, "U", "M"],
            ["2007/01/01", "02:00", 0.2141, "M", "O"],
            ["2007/01/01", "04:00", 0.2142, "D", "M"],
        ],
        columns=["date", "time", "var", "flag", "oflag"],
    )

    if parser.engine == "python" and lineterminator == "\r":
        mark = pytest.mark.xfail(reason="'CR' not respected by the Python parser yet")
        request.applymarker(mark)

    data = data.replace("\n", lineterminator)

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data),
            skiprows=1,
            delim_whitespace=True,
            names=["date", "time", "var", "flag", "oflag"],
        )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # AssertionError: DataFrame are different
def test_skiprows_infield_quote(all_parsers):
    # see gh-14459
    parser = all_parsers
    data = 'a"\nb"\na\n1'
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(StringIO(data), skiprows=2)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        ({}, DataFrame({"1": [3, 5]})),
        ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
    ],
)
def test_skip_rows_callable(all_parsers, kwargs, expected):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_callable_not_in(all_parsers):
    parser = all_parsers
    data = "0,a\n1,b\n2,c\n3,d\n4,e"
    expected = DataFrame([[1, "b"], [3, "d"]])

    result = parser.read_csv(
        StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
    )
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_skip_all(all_parsers):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"
    msg = "No columns to parse from file"

    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: True)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_bad_callable(all_parsers):
    msg = "by zero"
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    with pytest.raises(ZeroDivisionError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)


@xfail_pyarrow  # ValueError: skiprows argument must be an integer
def test_skip_rows_and_n_rows(all_parsers):
    # GH#44021
    data = """a,b
1,a
2,b
3,c
4,d
5,e
6,f
7,g
8,h
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
    expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_skip_rows_with_chunks(all_parsers):
    # GH 55677
    data = """col_a
10
20
30
40
50
60
70
80
90
100
"""
    parser = all_parsers
    # The parser fixture already selects the engine, so no engine kwarg here.
    reader = parser.read_csv(
        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
    )
    df1 = next(reader)
    df2 = next(reader)

    tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))
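

# Sketch of the callable skiprows form used above: the callable receives the
# 0-based row number (header row included) and returns True to skip it, which
# makes downsampling large files cheap.  Illustrative only.
def _demo_skiprows_callable():
    from io import StringIO

    import pandas as pd

    data = "a\n1\n2\n3\n4\n5"
    # Keep the header (row 0) and every second data row.
    result = pd.read_csv(StringIO(data), skiprows=lambda i: i > 0 and i % 2 == 0)
    assert result["a"].tolist() == [1, 3, 5]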
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Tests the TextReader class in parsers.pyx, which
|
||||
is integral to the C engine in parsers.py
|
||||
"""
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas._libs.parsers as parser
|
||||
from pandas._libs.parsers import TextReader
|
||||
from pandas.errors import ParserWarning
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.parsers import (
|
||||
TextFileReader,
|
||||
read_csv,
|
||||
)
|
||||
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
|
||||
|
||||
|
||||
class TestTextReader:
|
||||
@pytest.fixture
|
||||
def csv_path(self, datapath):
|
||||
return datapath("io", "data", "csv", "test1.csv")
|
||||
|
||||
def test_file_handle(self, csv_path):
|
||||
with open(csv_path, "rb") as f:
|
||||
reader = TextReader(f)
|
||||
reader.read()
|
||||
|
||||
def test_file_handle_mmap(self, csv_path):
|
||||
# this was never using memory_map=True
|
||||
with open(csv_path, "rb") as f:
|
||||
reader = TextReader(f, header=None)
|
||||
reader.read()
|
||||
|
||||
def test_StringIO(self, csv_path):
|
||||
with open(csv_path, "rb") as f:
|
||||
text = f.read()
|
||||
src = BytesIO(text)
|
||||
reader = TextReader(src, header=None)
|
||||
reader.read()
|
||||
|
||||
def test_string_factorize(self):
|
||||
# should this be optional?
|
||||
data = "a\nb\na\nb\na"
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
assert len(set(map(id, result[0]))) == 2
|
||||
|
||||
def test_skipinitialspace(self):
|
||||
data = "a, b\na, b\na, b\na, b"
|
||||
|
||||
reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
|
||||
result = reader.read()
|
||||
|
||||
tm.assert_numpy_array_equal(
|
||||
result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
|
||||
)
|
||||
tm.assert_numpy_array_equal(
|
||||
result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
|
||||
)
|
||||
|
||||
def test_parse_booleans(self):
|
||||
data = "True\nFalse\nTrue\nTrue"
|
||||
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
|
||||
assert result[0].dtype == np.bool_
|
||||
|
||||
def test_delimit_whitespace(self):
|
||||
data = 'a b\na\t\t "b"\n"a"\t \t b'
|
||||
|
||||
reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
|
||||
result = reader.read()
|
||||
|
||||
tm.assert_numpy_array_equal(
|
||||
result[0], np.array(["a", "a", "a"], dtype=np.object_)
|
||||
)
|
||||
tm.assert_numpy_array_equal(
|
||||
result[1], np.array(["b", "b", "b"], dtype=np.object_)
|
||||
)
|
||||
|
||||
def test_embedded_newline(self):
|
||||
data = 'a\n"hello\nthere"\nthis'
|
||||
|
||||
reader = TextReader(StringIO(data), header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
|
||||
tm.assert_numpy_array_equal(result[0], expected)
|
||||
|
||||
def test_euro_decimal(self):
|
||||
data = "12345,67\n345,678"
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array([12345.67, 345.678])
|
||||
tm.assert_almost_equal(result[0], expected)
|
||||
|
||||
def test_integer_thousands(self):
|
||||
data = "123,456\n12,500"
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
|
||||
result = reader.read()
|
||||
|
||||
expected = np.array([123456, 12500], dtype=np.int64)
|
||||
tm.assert_almost_equal(result[0], expected)
|
||||
|
||||
def test_integer_thousands_alt(self):
|
||||
data = "123.456\n12.500"
|
||||
|
||||
reader = TextFileReader(
|
||||
StringIO(data), delimiter=":", thousands=".", header=None
|
||||
)
|
||||
result = reader.read()
|
||||
|
||||
expected = DataFrame([123456, 12500])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_skip_bad_lines(self):
|
||||
# too many lines, see #2430 for why
|
||||
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"
|
||||
|
||||
reader = TextReader(StringIO(data), delimiter=":", header=None)
|
||||
msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
|
||||
with pytest.raises(parser.ParserError, match=msg):
|
||||
reader.read()
|
||||
|
||||
reader = TextReader(
|
||||
StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip
|
||||
)
|
        result = reader.read()
        expected = {
            0: np.array(["a", "d", "g", "l"], dtype=object),
            1: np.array(["b", "e", "h", "m"], dtype=object),
            2: np.array(["c", "f", "i", "n"], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

        with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
            reader = TextReader(
                StringIO(data), delimiter=":", header=None, on_bad_lines=1  # Warn
            )
            reader.read()

    def test_header_not_enough_lines(self):
        data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"

        reader = TextReader(StringIO(data), delimiter=",", header=2)
        header = reader.header
        expected = [["a", "b", "c"]]
        assert header == expected

        recs = reader.read()
        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array([2, 5], dtype=np.int64),
            2: np.array([3, 6], dtype=np.int64),
        }
        assert_array_dicts_equal(recs, expected)

    def test_escapechar(self):
        data = '\\"hello world"\n\\"hello world"\n\\"hello world"'

        reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
        result = reader.read()
        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(result, expected)

    def test_eof_has_eol(self):
        # handling of new line at EOF
        pass

    def test_na_substitution(self):
        pass

    def test_numpy_string_dtype(self):
        data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

        def _make_reader(**kwds):
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", header=None, **kwds)

        reader = _make_reader(dtype="S5,i4")
        result = reader.read()

        assert result[0].dtype == "S5"

        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "i4"

        reader = _make_reader(dtype="S4")
        result = reader.read()
        assert result[0].dtype == "S4"
        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "S4"

    def test_pass_dtype(self):
        data = """\
one,two
1,a
2,b
3,c
4,d"""

        def _make_reader(**kwds):
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(dtype={"one": "u1", 1: "S1"})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "S1"

        reader = _make_reader(dtype={"one": np.uint8, 1: object})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

        reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

    def test_usecols(self):
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(usecols=(1, 2))
        result = reader.read()

        exp = _make_reader().read()
        assert len(result) == 2
        assert (result[1] == exp[1]).all()
        assert (result[2] == exp[2]).all()

    @pytest.mark.parametrize(
        "text, kwargs",
        [
            ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
            (
                "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12",
                {"delim_whitespace": True},
            ),
            ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
            (
                (
                    "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
                    "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
                    ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
                ),
                {"delimiter": ","},
            ),
            ("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}),
            ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
        ],
    )
    def test_cr_delimited(self, text, kwargs):
        nice_text = text.replace("\r", "\r\n")
        result = TextReader(StringIO(text), **kwargs).read()
        expected = TextReader(StringIO(nice_text), **kwargs).read()
        assert_array_dicts_equal(result, expected)

    def test_empty_field_eof(self):
        data = "a,b,c\n1,2,3\n4,,"

        result = TextReader(StringIO(data), delimiter=",").read()

        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array(["2", ""], dtype=object),
            2: np.array(["3", ""], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

    @pytest.mark.parametrize("repeat", range(10))
    def test_empty_field_eof_mem_access_bug(self, repeat):
        # GH5664
        a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
        c = DataFrame(
            [
                [1, 2, 3, 4],
                [6, np.nan, np.nan, np.nan],
                [8, 9, 10, 11],
                [13, 14, np.nan, np.nan],
            ],
            columns=list("abcd"),
            index=[0, 5, 7, 12],
        )

        df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
        tm.assert_frame_equal(df, a)

        df = read_csv(
            StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
        )
        tm.assert_frame_equal(df, b)

        df = read_csv(
            StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
            names=list("abcd"),
            engine="c",
        )
        tm.assert_frame_equal(df, c)

    def test_empty_csv_input(self):
        # GH14867
        with read_csv(
            StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
        ) as df:
            assert isinstance(df, TextFileReader)


def assert_array_dicts_equal(left, right):
    for k, v in left.items():
tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))
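

def _example_assert_array_dicts_equal():
    # Illustrative sketch (hypothetical helper, not exercised by the suite):
    # TextReader.read() returns a dict mapping column position to a NumPy
    # array, which is exactly the shape the comparison helper above consumes.
    left = {0: np.array([1, 2], dtype=np.int64)}
    right = {0: np.array([1, 2], dtype=np.int64)}
    assert_array_dicts_equal(left, right)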
@@ -0,0 +1,226 @@
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.

Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
from io import StringIO
import os
from pathlib import Path

import pytest

from pandas.errors import ParserError

import pandas._testing as tm

from pandas.io.parsers import read_csv
import pandas.io.parsers.readers as parsers

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    return request.param


class TestUnsupportedFeatures:
    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        data = "a b c\n1 2 3"

        for engine in ("c", "python"):
            with pytest.raises(TypeError, match="unexpected keyword"):
                read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)

    def test_c_engine(self):
        # see gh-6607
        data = "a b c\n1 2 3"
        msg = "does not support"

        depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

        # specify C engine with unsupported options (raise)
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match=depr_msg):
                read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep=r"\s")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)):
            read_csv(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=r"\s")
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep="\t", quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), skipfooter=1)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = "Error tokenizing data"

        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), sep="\\s+")
        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), engine="c", sep="\\s+")

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands=",,")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands="")

        msg = "Only length-1 line terminators supported"
        data = "a,b,c~~1,2,3~~4,5,6"
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator="~~")

    def test_python_engine(self, python_engine):
        from pandas.io.parsers.readers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in py_unsupported:
            msg = (
                f"The {repr(default)} option is not "
                f"supported with the {repr(python_engine)} engine"
            )

            kwargs = {default: object()}
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=python_engine, **kwargs)

    def test_python_engine_file_no_iter(self, python_engine):
        # see gh-16530
        class NoNextBuffer:
            def __init__(self, csv_data) -> None:
                self.data = csv_data

            def __next__(self):
                return self.data.__next__()

            def read(self):
                return self.data

            def readline(self):
                return self.data

        data = "a\n1"
        msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"

        with pytest.raises(TypeError, match=msg):
            read_csv(NoNextBuffer(data), engine=python_engine)

    def test_pyarrow_engine(self):
        from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in pa_unsupported:
            msg = (
                f"The {repr(default)} option is not "
                f"supported with the 'pyarrow' engine"
            )
            kwargs = {default: object()}
            default_needs_bool = {"warn_bad_lines", "error_bad_lines"}
            if default == "dialect":
                kwargs[default] = "excel"  # test a random dialect
            elif default in default_needs_bool:
                kwargs[default] = True
            elif default == "on_bad_lines":
                kwargs[default] = "warn"

            warn = None
            depr_msg = None
            if "delim_whitespace" in kwargs:
                depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
                warn = FutureWarning
            if "verbose" in kwargs:
                depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
                warn = FutureWarning

            with pytest.raises(ValueError, match=msg):
                with tm.assert_produces_warning(warn, match=depr_msg):
                    read_csv(StringIO(data), engine="pyarrow", **kwargs)

    def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
        # GH 5686
        # GH 54643
        sio = StringIO("a,b\n1,2")
        bad_lines_func = lambda x: x
        parser = all_parsers
        if all_parsers.engine not in ["python", "pyarrow"]:
            msg = (
                "on_bad_line can only be a callable "
                "function if engine='python' or 'pyarrow'"
            )
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(sio, on_bad_lines=bad_lines_func)
        else:
            parser.read_csv(sio, on_bad_lines=bad_lines_func)


def test_close_file_handle_on_invalid_usecols(all_parsers):
    # GH 45384
    parser = all_parsers

    error = ValueError
    if parser.engine == "pyarrow":
        # Raises pyarrow.lib.ArrowKeyError
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    with tm.ensure_clean("test.csv") as fname:
        Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
        with tm.assert_produces_warning(False):
            with pytest.raises(error, match="col3"):
                parser.read_csv(fname, usecols=["col1", "col2", "col3"])
        # unlink fails on windows if file handles still point to it
        os.unlink(fname)


def test_invalid_file_inputs(request, all_parsers):
    # GH#45957
    parser = all_parsers
    if parser.engine == "python":
        request.applymarker(
            pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
        )

    with pytest.raises(ValueError, match="Invalid"):
        parser.read_csv([])


def test_invalid_dtype_backend(all_parsers):
    parser = all_parsers
    msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=msg):
parser.read_csv("test", dtype_backend="numpy")
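

def _example_valid_dtype_backend():
    # Illustrative sketch (hypothetical helper, not part of the test run):
    # only "numpy_nullable" and "pyarrow" pass the validation exercised
    # above; with "numpy_nullable", integer columns come back as nullable
    # extension dtypes rather than plain numpy dtypes.
    df = read_csv(StringIO("a,b\n1,2"), dtype_backend="numpy_nullable")
    assert str(df["a"].dtype) == "Int64"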
@@ -0,0 +1,102 @@
import numpy as np
import pytest

from pandas._libs.parsers import (
    _maybe_upcast,
    na_values,
)

import pandas as pd
from pandas import NA
import pandas._testing as tm
from pandas.core.arrays import (
    ArrowStringArray,
    BooleanArray,
    FloatingArray,
    IntegerArray,
    StringArray,
)


def test_maybe_upcast(any_real_numpy_dtype):
    # GH#36712

    dtype = np.dtype(any_real_numpy_dtype)
    na_value = na_values[dtype]
    arr = np.array([1, 2, na_value], dtype=dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, True])
    if issubclass(dtype.type, np.integer):
        expected = IntegerArray(arr, mask=expected_mask)
    else:
        expected = FloatingArray(arr, mask=expected_mask)

    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_no_na(any_real_numpy_dtype):
    # GH#36712
    arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, False])
    if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer):
        expected = IntegerArray(arr, mask=expected_mask)
    else:
        expected = FloatingArray(arr, mask=expected_mask)

    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_bool():
    # GH#36712
    dtype = np.bool_
    na_value = na_values[dtype]
    arr = np.array([True, False, na_value], dtype="uint8").view(dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, True])
    expected = BooleanArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_bool_no_nan():
    # GH#36712
    dtype = np.bool_
    arr = np.array([True, False, False], dtype="uint8").view(dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([False, False, False])
    expected = BooleanArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_all_nan():
    # GH#36712
    dtype = np.int64
    na_value = na_values[dtype]
    arr = np.array([na_value, na_value], dtype=dtype)
    result = _maybe_upcast(arr, use_dtype_backend=True)

    expected_mask = np.array([True, True])
    expected = IntegerArray(arr, mask=expected_mask)
    tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
    # GH#36712
    pa = pytest.importorskip("pyarrow")

    with pd.option_context("mode.string_storage", string_storage):
        arr = np.array(["a", "b", val], dtype=np.object_)
        result = _maybe_upcast(arr, use_dtype_backend=True)

        if string_storage == "python":
            exp_val = "c" if val == "c" else NA
            expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
        else:
            exp_val = "c" if val == "c" else None
            expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
tm.assert_extension_array_equal(result, expected)
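

def _example_masked_array_layout():
    # Illustrative sketch (hypothetical helper): the expected values above
    # pair a raw values buffer with a boolean mask marking NA slots, which
    # is exactly how IntegerArray is constructed.
    arr = IntegerArray(
        np.array([1, 2, 3], dtype=np.int64),
        mask=np.array([False, True, False]),
    )
    assert arr[1] is NA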
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,194 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import pytest

from pandas import (
    DataFrame,
    Index,
    Timestamp,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")

_msg_pyarrow_requires_names = (
    "The pyarrow engine does not allow 'usecols' to be integer column "
    "positions. Pass a list of string column names instead."
)


@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
    # see gh-9755
    data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
    parser = all_parsers
    parse_dates = [[1, 2]]

    depr_msg = (
        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
    )

    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])
    if parser.engine == "pyarrow":
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(
                    StringIO(data), usecols=usecols, parse_dates=parse_dates
                )
        return
    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), usecols=usecols, parse_dates=parse_dates
        )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns
def test_usecols_with_parse_dates2(all_parsers):
    # see gh-13604
    parser = all_parsers
    data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""

    names = ["date", "values"]
    usecols = names[:]
    parse_dates = [0]

    index = Index(
        [
            Timestamp("2008-02-07 09:40"),
            Timestamp("2008-02-07 09:50"),
            Timestamp("2008-02-07 10:00"),
        ],
        name="date",
    )
    cols = {"values": [1032.43, 1042.54, 1051.65]}
    expected = DataFrame(cols, index=index)

    result = parser.read_csv(
        StringIO(data),
        parse_dates=parse_dates,
        index_col=0,
        usecols=usecols,
        header=None,
        names=names,
    )
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates3(all_parsers):
    # see gh-14792
    parser = all_parsers
    data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""

    usecols = list("abcdefghij")
    parse_dates = [0]

    cols = {
        "a": Timestamp("2016-09-21").as_unit("ns"),
        "b": [1],
        "c": [1],
        "d": [2],
        "e": [3],
        "f": [4],
        "g": [5],
        "h": [6],
        "i": [7],
        "j": [8],
    }
    expected = DataFrame(cols, columns=usecols)

    result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates4(all_parsers):
    data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    usecols = list("abcdefghij")
    parse_dates = [[0, 1]]
    parser = all_parsers

    cols = {
        "a_b": "2016/09/21 1",
        "c": [1],
        "d": [2],
        "e": [3],
        "f": [4],
        "g": [5],
        "h": [6],
        "i": [7],
        "j": [8],
    }
    expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))

    depr_msg = (
        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
    )
    with tm.assert_produces_warning(
        (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data),
            usecols=usecols,
            parse_dates=parse_dates,
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
    "names",
    [
        list("abcde"),  # Names span all columns in original data.
        list("acd"),  # Names span only the selected columns.
    ],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request):
    # see gh-9755
    s = """0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""
    parse_dates = [[1, 2]]
    parser = all_parsers

    if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0):
        mark = pytest.mark.xfail(
            reason="Length mismatch in some cases, UserWarning in other"
        )
        request.applymarker(mark)

    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    depr_msg = (
        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
    )
    with tm.assert_produces_warning(
        (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
        )
tm.assert_frame_equal(result, expected)
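

def _example_flat_parse_dates():
    # Illustrative sketch (hypothetical helper) of the non-nested spelling
    # the deprecation warnings above point toward: select columns by name
    # and parse a single date column directly.
    import pandas as pd

    data = "a,b,c\n0,2014-01-01,4\n0,2014-01-02,4"
    df = pd.read_csv(StringIO(data), usecols=["a", "b"], parse_dates=["b"])
    assert str(df["b"].dtype) == "datetime64[ns]"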
@@ -0,0 +1,96 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import pytest

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


def test_usecols_with_unicode_strings(all_parsers):
    # see gh-13219
    data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "AAA": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "BBB": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_single_byte_unicode_strings(all_parsers):
    # see gh-13219
    data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "A": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "B": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=["A", "B"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
    data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers
    _msg_validate_usecols_arg = (
        "'usecols' must either be list-like "
        "of all strings, all unicode, all "
        "integers or a callable."
    )
    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols=usecols)


@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
    data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "あああ": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "いい": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
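

def _example_multi_byte_usecols():
    # Illustrative sketch (hypothetical helper): selecting columns by name
    # works the same for multi-byte headers as for ASCII ones.
    import pandas as pd

    df = pd.read_csv(StringIO("あああ,いい\n1,2"), usecols=["いい"])
    assert list(df.columns) == ["いい"]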
@@ -0,0 +1,563 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas.errors import ParserError

from pandas import (
    DataFrame,
    Index,
    array,
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

_msg_validate_usecols_arg = (
    "'usecols' must either be list-like "
    "of all strings, all unicode, all "
    "integers or a callable."
)
_msg_validate_usecols_names = (
    "Usecols do not match columns, columns expected but not found: {0}"
)
_msg_pyarrow_requires_names = (
    "The pyarrow engine does not allow 'usecols' to be integer column "
    "positions. Pass a list of string column names instead."
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


def test_raise_on_mixed_dtype_usecols(all_parsers):
    # See gh-12678
    data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
    usecols = [0, "b", 2]
    parser = all_parsers

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
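

def _example_homogeneous_usecols():
    # Illustrative sketch (hypothetical helper) of the valid spellings the
    # test above contrasts: an all-integer or all-string usecols list is
    # accepted, and both select the same columns.
    import pandas as pd

    data = "a,b,c\n1000,2000,3000"
    by_position = pd.read_csv(StringIO(data), usecols=[0, 2])
    by_name = pd.read_csv(StringIO(data), usecols=["a", "c"])
    tm.assert_frame_equal(by_position, by_name)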


@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols, request):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    if parser.engine == "pyarrow" and isinstance(usecols[0], int):
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    result = parser.read_csv(StringIO(data), usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_names(all_parsers):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    names = ["foo", "bar"]

    if parser.engine == "pyarrow":
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
        return

    result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    if parser.engine == "pyarrow" and not isinstance(usecols[0], int):
        # ArrowKeyError: Column 'fb' in include_columns does not exist
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_relative_to_names2(all_parsers):
    # see gh-5766
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers

    result = parser.read_csv(
        StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
    )

    expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


# regex mismatch: "Length mismatch: Expected axis has 1 elements"
@xfail_pyarrow
def test_usecols_name_length_conflict(all_parsers):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    msg = "Number of passed names did not match number of header fields in the file"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])


def test_usecols_single_string(all_parsers):
    # see gh-20558
    parser = all_parsers
    data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols="foo")


@skip_pyarrow  # CSV parse error in one case, AttributeError in another
@pytest.mark.parametrize(
    "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
    # see gh-9082
    parser = all_parsers
    usecols = ["a", "c", "d"]
    expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})

    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request):
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    if parser.engine == "pyarrow" and isinstance(usecols[0], int):
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
        return

    expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))

    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
    tm.assert_frame_equal(result, expected)


def test_usecols_index_col_conflict2(all_parsers):
    # see gh-4201: test that index_col as integer reflects usecols
    parser = all_parsers
    data = "a,b,c,d\nA,a,1,one\nB,b,2,two"

    expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
    expected = expected.set_index(["b", "c"])

    result = parser.read_csv(
        StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 4
def test_usecols_implicit_index_col(all_parsers):
    # see gh-2654
    parser = all_parsers
    data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

    result = parser.read_csv(StringIO(data), usecols=["a", "b"])
    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


def test_usecols_index_col_middle(all_parsers):
    # GH#9098
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
"""
    result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c")
    expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c"))
    tm.assert_frame_equal(result, expected)


def test_usecols_index_col_end(all_parsers):
    # GH#9098
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
"""
    result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d")
    expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d"))
    tm.assert_frame_equal(result, expected)


def test_usecols_regex_sep(all_parsers):
    # see gh-2733
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"

    if parser.engine == "pyarrow":
        msg = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
        return

    result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))

    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_whitespace(all_parsers):
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(
                FutureWarning, match=depr_msg, check_stacklevel=False
            ):
                parser.read_csv(
                    StringIO(data), delim_whitespace=True, usecols=("a", "b")
                )
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        result = parser.read_csv(
            StringIO(data), delim_whitespace=True, usecols=("a", "b")
        )
    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "usecols,expected",
    [
        # Column selection by index.
        ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
        # Column selection by name.
        (
            ["0", "1"],
            DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),
        ),
    ],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request):
    parser = all_parsers
    data = """2,0,1
1000,2000,3000
4000,5000,6000"""

    if parser.engine == "pyarrow" and isinstance(usecols[0], int):
        with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)


@xfail_pyarrow  # mismatched shape
def test_empty_usecols(all_parsers):
    data = "a,b,c\n1,2,3\n4,5,6"
    expected = DataFrame(columns=Index([]))
    parser = all_parsers

    result = parser.read_csv(StringIO(data), usecols=set())
    tm.assert_frame_equal(result, expected)


def test_np_array_usecols(all_parsers):
    # see gh-12546
    parser = all_parsers
    data = "a,b,c\n1,2,3"
    usecols = np.array(["a", "b"])

    expected = DataFrame([[1, 2]], columns=usecols)
    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "usecols,expected",
    [
        (
            lambda x: x.upper() in ["AAA", "BBB", "DDD"],
            DataFrame(
                {
                    "AaA": {
                        0: 0.056674972999999997,
                        1: 2.6132309819999997,
                        2: 3.5689350380000002,
                    },
                    "bBb": {0: 8, 1: 2, 2: 7},
                    "ddd": {0: "a", 1: "b", 2: "a"},
                }
            ),
        ),
        (lambda x: False, DataFrame(columns=Index([]))),
    ],
)
def test_callable_usecols(all_parsers, usecols, expected):
    # see gh-14154
    data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine does not allow 'usecols' to be a callable"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
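

def _example_callable_usecols():
    # Illustrative sketch (hypothetical helper): a callable usecols receives
    # each header string and keeps the column when it returns True, as the
    # parametrized cases above rely on.
    import pandas as pd

    df = pd.read_csv(StringIO("AaA,bBb,CCC\n1,2,3"), usecols=lambda name: name.isupper())
    assert list(df.columns) == ["CCC"]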


# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file
@skip_pyarrow
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
    # see gh-6710
    data = "1,2\n1,2,3"
    parser = all_parsers
    names = ["a", "b", "c"]
    expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})

    result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Expected 3 columns, got 4
@pytest.mark.parametrize(
    "data,usecols,kwargs,expected",
    [
        # see gh-8985
        (
            "19,29,39\n" * 2 + "10,20,30,40",
            [0, 1, 2],
            {"header": None},
            DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
        ),
        # see gh-9549
        (
            ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
            ["A", "B", "C"],
            {},
            DataFrame(
                {
                    "A": [1, 3, 1, 1, 1, 5],
                    "B": [2, 4, 2, 2, 2, 6],
                    "C": [3, 5, 4, 3, 3, 7],
                }
            ),
        ),
    ],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
    # see gh-8985
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "usecols,kwargs,expected,msg",
    [
        (
            ["a", "b", "c", "d"],
            {},
            DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
            None,
        ),
        (
            ["a", "b", "c", "f"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
        (
            ["a", "b", "f", "g"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
        ),
        # see gh-14671
        (
            None,
            {"header": 0, "names": ["A", "B", "C", "D"]},
            DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
            None,
        ),
        (
            ["A", "B", "C", "f"],
            {"header": 0, "names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (
            ["A", "B", "f"],
            {"names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
    ],
)
def test_raises_on_usecols_names_mismatch(
    all_parsers, usecols, kwargs, expected, msg, request
):
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    kwargs.update(usecols=usecols)
    parser = all_parsers

    if parser.engine == "pyarrow" and not (
        usecols is not None and expected is not None
    ):
        # everything but the first case
        # ArrowKeyError: Column 'f' in include_columns does not exist in CSV file
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    if expected is None:
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    names = ["A", "B", "C", "D"]
    parser = all_parsers

    if parser.engine == "pyarrow":
        if isinstance(usecols[0], int):
            with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
                parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
            return
        # "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
    expected = DataFrame({"A": [1, 5], "C": [3, 7]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("names", [None, ["a", "b"]])
def test_usecols_indices_out_of_bounds(all_parsers, names):
    # GH#25623 & GH 41130; enforced in 2.0
    parser = all_parsers
    data = """
a,b
1,2
"""

    err = ParserError
    msg = "Defining usecols with out-of-bounds"
    if parser.engine == "pyarrow":
        err = ValueError
        msg = _msg_pyarrow_requires_names

    with pytest.raises(err, match=msg):
        parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)


def test_usecols_additional_columns(all_parsers):
    # GH#46997
    parser = all_parsers
    usecols = lambda header: header.strip() in ["a", "b", "c"]

    if parser.engine == "pyarrow":
        msg = "The pyarrow engine does not allow 'usecols' to be a callable"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
        return
    result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
    expected = DataFrame({"a": ["x"], "b": "y"})
    tm.assert_frame_equal(result, expected)


def test_usecols_additional_columns_integer_columns(all_parsers):
    # GH#46997
    parser = all_parsers
    usecols = lambda header: header.strip() in ["0", "1"]
    if parser.engine == "pyarrow":
        msg = "The pyarrow engine does not allow 'usecols' to be a callable"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
        return
    result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
    expected = DataFrame({"0": ["x"], "1": "y"})
    tm.assert_frame_equal(result, expected)


def test_usecols_dtype(all_parsers):
    parser = all_parsers
    data = """
col1,col2,col3
a,1,x
b,2,y
"""
    result = parser.read_csv(
        StringIO(data),
        usecols=["col1", "col2"],
        dtype={"col1": "string", "col2": "uint8", "col3": "string"},
    )
    expected = DataFrame(
        {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
    )
tm.assert_frame_equal(result, expected)
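

def _example_usecols_dtype_extra_keys():
    # Illustrative sketch (hypothetical helper): dtype entries for columns
    # that usecols drops (here "col3") are simply ignored, as the test above
    # demonstrates.
    import pandas as pd

    data = "col1,col2,col3\na,1,x"
    df = pd.read_csv(
        StringIO(data), usecols=["col2"], dtype={"col2": "uint8", "col3": "string"}
    )
    assert str(df["col2"].dtype) == "uint8"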