Initial Debugging Completed and Execution Successful
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,382 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import parsers as libparsers
|
||||
from pandas.errors import DtypeWarning
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
# Module-level pytest mark: ignore the DeprecationWarning whose message starts
# with "Passing a BlockManager to DataFrame" for the tests in this module.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    """Chunked reads honour ``index_col`` given by position or by label.

    The parametrized ``index_col`` is forwarded to ``read_csv`` so that both
    the positional (``0``) and the label (``"index"``) spelling are exercised.
    Six data rows read with ``chunksize=2`` must come back as exactly three
    chunks that together equal the expected frame.
    """
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    expected = DataFrame(
        [
            ["foo", 2, 3, 4, 5],
            ["bar", 7, 8, 9, 10],
            ["baz", 12, 13, 14, 15],
            ["qux", 12, 13, 14, 15],
            ["foo2", 12, 13, 14, 15],
            ["bar2", 12, 13, 14, 15],
        ],
        columns=["index", "A", "B", "C", "D"],
    )
    expected = expected.set_index("index")

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            with parser.read_csv(
                StringIO(data), index_col=index_col, chunksize=2
            ) as reader:
                list(reader)
        return

    with parser.read_csv(StringIO(data), index_col=index_col, chunksize=2) as reader:
        chunks = list(reader)

    # Pin the chunk count so a surplus or missing chunk cannot go unnoticed.
    assert len(chunks) == 3
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    """A float, string or zero ``chunksize`` must raise ``ValueError``."""
    csv_text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    if all_parsers.engine == "pyarrow":
        # pyarrow reports the option itself as unsupported.
        expected_msg = (
            "The 'chunksize' option is not supported with the 'pyarrow' engine"
        )
    else:
        expected_msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=expected_msg):
        with all_parsers.read_csv(StringIO(csv_text), chunksize=chunksize) as _:
            pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    """Chunked reading combined with ``nrows`` equals a plain read (gh-15755)."""
    csv_text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    read_opts = {"index_col": 0, "nrows": 5}

    if all_parsers.engine == "pyarrow":
        unsupported = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=unsupported):
            all_parsers.read_csv(StringIO(csv_text), **read_opts)
        return

    whole = all_parsers.read_csv(StringIO(csv_text), **read_opts)
    with all_parsers.read_csv(
        StringIO(csv_text), chunksize=chunksize, **read_opts
    ) as reader:
        stitched = concat(reader)
    tm.assert_frame_equal(stitched, whole)
|
||||
|
||||
|
||||
def test_read_chunksize_and_nrows_changing_size(all_parsers):
    """``get_chunk(size=...)`` may vary per call but never reads past ``nrows``."""
    csv_text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    read_opts = {"index_col": 0, "nrows": 5}

    if all_parsers.engine == "pyarrow":
        unsupported = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=unsupported):
            all_parsers.read_csv(StringIO(csv_text), **read_opts)
        return

    whole = all_parsers.read_csv(StringIO(csv_text), **read_opts)
    with all_parsers.read_csv(StringIO(csv_text), chunksize=8, **read_opts) as reader:
        first = reader.get_chunk(size=2)
        tm.assert_frame_equal(first, whole.iloc[:2])

        # Four rows are requested, but only three remain below the nrows cap.
        second = reader.get_chunk(size=4)
        tm.assert_frame_equal(second, whole.iloc[2:5])

        with pytest.raises(StopIteration, match=""):
            reader.get_chunk(size=3)
|
||||
|
||||
|
||||
def test_get_chunk_passed_chunksize(all_parsers):
    """``get_chunk()`` without a size falls back to the reader's ``chunksize``."""
    csv_text = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The 'chunksize' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            with all_parsers.read_csv(StringIO(csv_text), chunksize=2) as reader:
                reader.get_chunk()
        return

    with all_parsers.read_csv(StringIO(csv_text), chunksize=2) as reader:
        first_chunk = reader.get_chunk()

    tm.assert_frame_equal(
        first_chunk,
        DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]),
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
    """Concatenated chunks equal the frame from a single read (gh-12185)."""
    csv_text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    # The one-shot read happens first for every engine, pyarrow included.
    whole = all_parsers.read_csv(StringIO(csv_text), **kwargs)

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The 'chunksize' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            with all_parsers.read_csv(
                StringIO(csv_text), chunksize=2, **kwargs
            ) as reader:
                concat(reader)
        return

    with all_parsers.read_csv(StringIO(csv_text), chunksize=2, **kwargs) as reader:
        stitched = concat(reader)
    tm.assert_frame_equal(stitched, whole)
|
||||
|
||||
|
||||
def test_read_chunksize_jagged_names(all_parsers):
    """Chunked reads pad short rows with NaN when ``names`` is wider (gh-23509)."""
    # Seven one-field rows followed by a single ten-field row.
    narrow_rows = ["0"] * 7
    wide_row = ",".join(["0"] * 10)
    csv_text = "\n".join(narrow_rows + [wide_row])

    expected = DataFrame(
        [[0] + [np.nan] * 9 for _ in range(7)] + [[0] * 10]
    )
    read_opts = {"names": range(10), "chunksize": 4}

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The 'chunksize' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            with all_parsers.read_csv(StringIO(csv_text), **read_opts) as reader:
                concat(reader)
        return

    with all_parsers.read_csv(StringIO(csv_text), **read_opts) as reader:
        stitched = concat(reader)
    tm.assert_frame_equal(stitched, expected)
|
||||
|
||||
|
||||
def test_chunk_begins_with_newline_whitespace(all_parsers):
    """Input opening with a newline and then a space keeps that space (gh-10022)."""
    buffer = StringIO("\n hello\nworld\n")
    parsed = all_parsers.read_csv(buffer, header=None)
    tm.assert_frame_equal(parsed, DataFrame([" hello", "world"]))
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
    """Integer and float rows in one column coerce to a single float column.

    Mainly an issue with the C parser; ``DEFAULT_BUFFER_HEURISTIC`` is patched
    to a small value for the duration of the read.
    """
    heuristic = 2**3
    int_rows = list(map(str, range(heuristic - 1)))
    csv_text = "\n".join(["a", *int_rows, "1.0", "2.0", *int_rows])

    # Coercions should work without warnings.
    with monkeypatch.context() as patcher:
        patcher.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
        parsed = all_parsers.read_csv(StringIO(csv_text))

    column = parsed.a
    assert type(column[0]) is np.float64
    assert column.dtype == float
|
||||
|
||||
|
||||
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
    """Chunks that cannot be coerced numerically trigger ``DtypeWarning``.

    see gh-3866: if chunks are different types and can't be coerced using
    numerical types, then issue warning. Only the C engine in low-memory mode
    is expected to warn.
    """
    low_memory_c = all_parsers.engine == "c" and all_parsers.low_memory
    if low_memory_c:
        expected_warning = DtypeWarning
        # Use larger size to hit warning path
        n_ints = 499999
    else:
        expected_warning = None
        n_ints = 10000

    int_rows = [str(i) for i in range(n_ints)]
    buf = StringIO("\n".join(["a", *int_rows, "a", "b", *int_rows]))

    if all_parsers.engine == "pyarrow":
        df = all_parsers.read_csv(buf)
    else:
        df = all_parsers.read_csv_check_warnings(
            expected_warning,
            r"Columns \(0\) have mixed types. "
            "Specify dtype option on import or set low_memory=False.",
            buf,
        )

    # Only the non-low-memory path under string inference yields "str".
    if using_infer_string and not low_memory_c:
        assert df.a.dtype == "str"
    else:
        assert df.a.dtype == object
|
||||
|
||||
|
||||
@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    """A header-only file yields an empty frame via nrows or chunksize (gh-9535)."""
    nrows = 10
    buffer = StringIO("foo,bar\n")

    def _load():
        # Either take the first chunk or do a capped one-shot read.
        if iterator:
            with all_parsers.read_csv(buffer, chunksize=nrows) as reader:
                return next(iter(reader))
        return all_parsers.read_csv(buffer, nrows=nrows)

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            _load()
        return

    tm.assert_frame_equal(_load(), DataFrame(columns=["foo", "bar"]))
|
||||
|
||||
|
||||
def test_read_csv_memory_growth_chunksize(all_parsers):
    """Iterating over every chunk of a file must not crash (gh-24805)."""

    def _drain(csv_path):
        # Walk through all chunks, discarding each one.
        with all_parsers.read_csv(csv_path, chunksize=20) as reader:
            for _ in reader:
                pass

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as handle:
            handle.writelines(f"{i}\n" for i in range(1000))

        if all_parsers.engine == "pyarrow":
            unsupported = (
                "The 'chunksize' option is not supported with the 'pyarrow' engine"
            )
            with pytest.raises(ValueError, match=unsupported):
                _drain(path)
            return

        _drain(path)
|
||||
|
||||
|
||||
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
    """Chunked read with ``usecols`` copes with a shorter final row (GH#21211).

    The reader is opened as a context manager so it is always closed, and the
    number of chunks is asserted so the comparison loop cannot pass vacuously
    when fewer chunks than expected are produced.
    """
    parser = all_parsers
    data = """1,2,3,4
5,6,7,8
9,10,11
"""
    read_kwargs = {
        "names": ["a", "b"],
        "chunksize": 2,
        "usecols": [0, 1],
        "header": None,
    }

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **read_kwargs)
        return

    with parser.read_csv(StringIO(data), **read_kwargs) as reader:
        result_chunks = list(reader)

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6]}),
        DataFrame({"a": [9], "b": [10]}, index=[2]),
    ]

    assert len(result_chunks) == len(expected_frames)
    for result, expected in zip(result_chunks, expected_frames):
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_chunksize_second_block_shorter(all_parsers):
    """Chunked read fills a shorter final row with NaN (GH#21211).

    The reader is opened as a context manager so it is always closed, and the
    number of chunks is asserted so the comparison loop cannot pass vacuously
    when fewer chunks than expected are produced.
    """
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), chunksize=2)
        return

    with parser.read_csv(StringIO(data), chunksize=2) as reader:
        result_chunks = list(reader)

    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
        DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
    ]

    assert len(result_chunks) == len(expected_frames)
    for result, expected in zip(result_chunks, expected_frames):
        tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,983 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from datetime import datetime
|
||||
from inspect import signature
|
||||
from io import StringIO
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_string_dtype
|
||||
|
||||
from pandas.compat import HAS_PYARROW
|
||||
from pandas.errors import (
|
||||
EmptyDataError,
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Timestamp,
|
||||
compat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.parsers import TextFileReader
|
||||
from pandas.io.parsers.c_parser_wrapper import CParserWrapper
|
||||
|
||||
# Module-level pytest mark: ignore the DeprecationWarning whose message starts
# with "Passing a BlockManager to DataFrame" for the tests in this module.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

# Decorator shortcuts that attach the "pyarrow_xfail" / "pyarrow_skip" fixtures
# to a test. NOTE(review): the fixtures are defined outside this file;
# presumably they xfail / skip the test under the pyarrow engine -- confirm.
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
def test_override_set_noconvert_columns():
    """``parse_dates`` picks the right columns whatever the usecols order.

    see gh-17351

    Usecols needs to be sorted in _set_noconvert_columns based
    on the test_usecols_with_parse_dates test from test_usecols.py
    """

    class BareTextFileReader(TextFileReader):
        # Skip the regular constructor; set only the two attributes below.
        def __init__(self) -> None:
            self._currow = 0
            self.squeeze = False

    class ReversedUsecolsWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)[::-1]
            return super()._set_noconvert_columns()

    csv_text = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""

    expected = DataFrame(
        {
            "a": [0, 0],
            "c_d": [
                Timestamp("2014-01-01 09:00:00"),
                Timestamp("2014-01-02 10:00:00"),
            ],
        },
        columns=["c_d", "a"],
    )

    reader = BareTextFileReader()
    reader.options = {
        "usecols": [0, 2, 3],
        "parse_dates": [[1, 2]],
        "delimiter": ",",
    }
    reader.engine = "c"
    reader._engine = ReversedUsecolsWrapper(StringIO(csv_text), **reader.options)

    tm.assert_frame_equal(reader.read(), expected)
|
||||
|
||||
|
||||
def test_read_csv_local(all_parsers, csv1):
    """Read the ``csv1`` fixture file through a ``file://`` URL."""
    scheme = "file:///" if compat.is_platform_windows() else "file://"
    url = f"{scheme}{os.path.abspath(csv1)}"

    parsed = all_parsers.read_csv(url, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if all_parsers.engine == "pyarrow":
        parsed.index = parsed.index.as_unit("ns")

    dates = [
        datetime(2000, 1, 3),
        datetime(2000, 1, 4),
        datetime(2000, 1, 5),
        datetime(2000, 1, 6),
        datetime(2000, 1, 7),
        datetime(2000, 1, 10),
        datetime(2000, 1, 11),
    ]
    values = [
        [0.980269, 3.685731, -0.364216805298, -1.159738],
        [1.047916, -0.041232, -0.16181208307, 0.212549],
        [0.498581, 0.731168, -0.537677223318, 1.346270],
        [1.120202, 1.567621, 0.00364077397681, 0.675253],
        [-0.487094, 0.571455, -1.6116394093, 0.103469],
        [0.836649, 0.246462, 0.588542635376, 1.062782],
        [-0.157161, 1.340307, 1.1957779562, -1.097007],
    ]
    expected = DataFrame(
        values,
        columns=["A", "B", "C", "D"],
        index=Index(dates, name="index"),
    )
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
|
||||
def test_1000_sep(all_parsers):
    """``thousands=","`` strips the grouping comma from numeric fields."""
    csv_text = """A|B|C
1|2,334|5
10|13|10.
"""
    read_opts = {"sep": "|", "thousands": ","}

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The 'thousands' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            all_parsers.read_csv(StringIO(csv_text), **read_opts)
        return

    parsed = all_parsers.read_csv(StringIO(csv_text), **read_opts)
    tm.assert_frame_equal(
        parsed, DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
    )
|
||||
|
||||
|
||||
@xfail_pyarrow  # ValueError: Found non-unique column index
def test_unnamed_columns(all_parsers):
    """Blank header fields are labelled ``Unnamed: <position>``."""
    csv_text = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    header = ["A", "B", "C", "Unnamed: 3", "Unnamed: 4"]
    rows = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]

    parsed = all_parsers.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(parsed, DataFrame(rows, dtype=np.int64, columns=header))
|
||||
|
||||
|
||||
def test_csv_mixed_type(all_parsers):
    """A file with one string column and two integer columns parses as such."""
    csv_text = """A,B,C
a,1,2
b,3,4
c,4,5
"""
    parsed = all_parsers.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(
        parsed,
        DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}),
    )
|
||||
|
||||
|
||||
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
    """``nrows=0`` plus ``index_col`` in low-memory mode gives an empty frame.

    see gh-21141
    """
    if not all_parsers.low_memory:
        pytest.skip("This is a low-memory specific test")

    csv_text = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""
    read_opts = {"low_memory": True, "index_col": 0, "nrows": 0}

    if all_parsers.engine == "pyarrow":
        unsupported = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=unsupported):
            all_parsers.read_csv(StringIO(csv_text), **read_opts)
        return

    parsed = all_parsers.read_csv(StringIO(csv_text), **read_opts)
    tm.assert_frame_equal(parsed, DataFrame(columns=["A", "B", "C"]))
|
||||
|
||||
|
||||
def test_read_csv_dataframe(all_parsers, csv1):
    """Read the ``csv1`` fixture file with a parsed datetime index."""
    parsed = all_parsers.read_csv(csv1, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if all_parsers.engine == "pyarrow":
        parsed.index = parsed.index.as_unit("ns")

    dates = [
        datetime(2000, 1, 3),
        datetime(2000, 1, 4),
        datetime(2000, 1, 5),
        datetime(2000, 1, 6),
        datetime(2000, 1, 7),
        datetime(2000, 1, 10),
        datetime(2000, 1, 11),
    ]
    values = [
        [0.980269, 3.685731, -0.364216805298, -1.159738],
        [1.047916, -0.041232, -0.16181208307, 0.212549],
        [0.498581, 0.731168, -0.537677223318, 1.346270],
        [1.120202, 1.567621, 0.00364077397681, 0.675253],
        [-0.487094, 0.571455, -1.6116394093, 0.103469],
        [0.836649, 0.246462, 0.588542635376, 1.062782],
        [-0.157161, 1.340307, 1.1957779562, -1.097007],
    ]
    expected = DataFrame(
        values,
        columns=["A", "B", "C", "D"],
        index=Index(dates, name="index"),
    )
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
    """``nrows`` given as ``3`` or ``3.0`` returns the first three rows (gh-10476)."""
    csv_text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    if all_parsers.engine == "pyarrow":
        unsupported = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=unsupported):
            all_parsers.read_csv(StringIO(csv_text), nrows=nrows)
        return

    first_three = DataFrame(
        [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
        columns=["index", "A", "B", "C", "D"],
    )
    parsed = all_parsers.read_csv(StringIO(csv_text), nrows=nrows)
    tm.assert_frame_equal(parsed, first_three)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
    """A fractional, string or negative ``nrows`` must raise ``ValueError``."""
    csv_text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    if all_parsers.engine == "pyarrow":
        # pyarrow reports the option itself as unsupported.
        expected_msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
    else:
        expected_msg = r"'nrows' must be an integer >=0"

    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(StringIO(csv_text), nrows=nrows)
|
||||
|
||||
|
||||
def test_nrows_skipfooter_errors(all_parsers):
    """Passing ``skipfooter`` together with ``nrows`` raises ``ValueError``."""
    buffer = StringIO("a\n1\n2\n3\n4\n5\n6")
    with pytest.raises(ValueError, match="'skipfooter' not supported with 'nrows'"):
        all_parsers.read_csv(buffer, skipfooter=1, nrows=5)
|
||||
|
||||
|
||||
@skip_pyarrow
def test_missing_trailing_delimiters(all_parsers):
    """Rows with an empty or absent last field get NaN in that column."""
    csv_text = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""
    rows = [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]]

    parsed = all_parsers.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(parsed, DataFrame(rows, columns=["A", "B", "C", "D"]))
|
||||
|
||||
|
||||
def test_skip_initial_space(all_parsers):
    """``skipinitialspace=True`` drops the blank that follows each delimiter."""
    raw_line = (
        '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
        "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
        "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
        "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
        "0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
        "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
    )
    read_opts = {
        "names": list(range(33)),
        "header": None,
        "na_values": ["-9999.0"],
        "skipinitialspace": True,
    }

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            all_parsers.read_csv(StringIO(raw_line), **read_opts)
        return

    parsed = all_parsers.read_csv(StringIO(raw_line), **read_opts)

    # One expected row; the six "-9999.0" fields are listed in na_values.
    expected_row = [
        "09-Apr-2012",
        "01:10:18.300",
        2456026.548822908,
        12849,
        1.00361,
        1.12551,
        330.65659,
        355626618.16711,
        73.48821,
        314.11625,
        1917.09447,
        179.71425,
        80.0,
        240.0,
        -350,
        70.06056,
        344.9837,
        1,
        1,
        -0.689265,
        -0.692787,
        0.212036,
        14.7674,
        41.605,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        0,
        12,
        128,
    ]
    tm.assert_frame_equal(parsed, DataFrame([expected_row]))
|
||||
|
||||
|
||||
@skip_pyarrow
def test_trailing_delimiters(all_parsers):
    """Trailing delimiters are ignored when ``index_col=False`` (gh-2442)."""
    csv_text = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
    parsed = all_parsers.read_csv(StringIO(csv_text), index_col=False)
    tm.assert_frame_equal(
        parsed, DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
    )
|
||||
|
||||
|
||||
def test_escapechar(all_parsers):
    """Backslash-escaped quotes inside a quoted field are unescaped."""
    # https://stackoverflow.com/questions/13824840/feature-request-for-
    # pandas-read-csv
    data = '''SEARCH_TERM,ACTUAL_URL
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''

    read_opts = {"escapechar": "\\", "quotechar": '"', "encoding": "utf-8"}
    parsed = all_parsers.read_csv(StringIO(data), **read_opts)

    unescaped_term = 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'
    assert parsed["SEARCH_TERM"][2] == unescaped_term

    tm.assert_index_equal(parsed.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))
|
||||
|
||||
|
||||
def test_ignore_leading_whitespace(all_parsers):
    """Leading blanks yield no empty first column with ``sep=r"\\s+"``.

    see gh-3374, gh-6607
    """
    csv_text = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"

    if all_parsers.engine == "pyarrow":
        unsupported = "the 'pyarrow' engine does not support regex separators"
        with pytest.raises(ValueError, match=unsupported):
            all_parsers.read_csv(StringIO(csv_text), sep=r"\s+")
        return

    parsed = all_parsers.read_csv(StringIO(csv_text), sep=r"\s+")
    tm.assert_frame_equal(
        parsed, DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
    )
|
||||
|
||||
|
||||
@skip_pyarrow
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
    """An over-long row is tolerated with ``usecols`` but not without (gh-12203)."""
    csv_text = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""

    if usecols is None:
        # Make sure that an error is still raised
        # when the "usecols" parameter is not provided.
        field_count_msg = r"Expected \d+ fields in line \d+, saw \d+"
        with pytest.raises(ParserError, match=field_count_msg):
            all_parsers.read_csv(StringIO(csv_text))
        return

    parsed = all_parsers.read_csv(StringIO(csv_text), usecols=usecols)
    tm.assert_frame_equal(parsed, DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}))
|
||||
|
||||
|
||||
@skip_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # First, check to see that the response of parser when faced with no
        # provided columns raises the correct error, with or without usecols.
        ("", {}, None),
        ("", {"usecols": ["X"]}, None),
        (
            ",,",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"], index=[0], dtype=np.float64),
        ),
        (
            "",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"]),
        ),
    ],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
    """Empty input raises ``EmptyDataError`` or yields ``expected`` (gh-12493).

    An ``expected`` of ``None`` marks the cases that must raise.
    """
    buffer = StringIO(data)

    if expected is not None:
        tm.assert_frame_equal(all_parsers.read_csv(buffer, **kwargs), expected)
        return

    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        all_parsers.read_csv(buffer, **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # gh-8661, gh-8679: this should ignore six lines, including
        # lines with trailing whitespace and blank lines.
        (
            {
                "header": None,
                "delim_whitespace": True,
                "skiprows": [0, 1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
        ),
        # gh-8983: test skipping set of rows after a row with trailing spaces.
        (
            {
                "delim_whitespace": True,
                "skiprows": [1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
        ),
    ],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
    """``skiprows`` works around lines that end in spaces or tabs."""
    data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n"  # noqa: E501
    # The file is read whitespace-delimited, so commas become blanks first.
    spaced = data.replace(",", " ")

    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    warn_opts = {"match": depr_msg, "check_stacklevel": False}

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            with tm.assert_produces_warning(FutureWarning, **warn_opts):
                all_parsers.read_csv(StringIO(spaced), **kwargs)
        return

    with tm.assert_produces_warning(FutureWarning, **warn_opts):
        parsed = all_parsers.read_csv(StringIO(spaced), **kwargs)
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
|
||||
def test_raise_on_sep_with_delim_whitespace(all_parsers):
    """Giving both ``sep`` and ``delim_whitespace`` raises ``ValueError`` (gh-6607)."""
    buffer = StringIO("a b c\n1 2 3")
    deprecation = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    with pytest.raises(ValueError, match="you can only specify one"):
        with tm.assert_produces_warning(
            FutureWarning, match=deprecation, check_stacklevel=False
        ):
            all_parsers.read_csv(buffer, sep=r"\s", delim_whitespace=True)
|
||||
|
||||
|
||||
def test_read_filepath_or_buffer(all_parsers):
    """A ``bytes`` object as ``filepath_or_buffer`` raises ``TypeError`` (gh-43366)."""
    expected_msg = "Expected file path name or file-like"
    with pytest.raises(TypeError, match=expected_msg):
        all_parsers.read_csv(filepath_or_buffer=b"input")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
    """Single-character rows parse correctly with ``skipinitialspace`` (gh-9710)."""
    data = """\
MyColumn
a
b
a
b\n"""

    read_opts = {
        "skipinitialspace": True,
        "delim_whitespace": delim_whitespace,
    }
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
    warn_opts = {"match": depr_msg, "check_stacklevel": False}

    if all_parsers.engine == "pyarrow":
        unsupported = (
            "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            with tm.assert_produces_warning(FutureWarning, **warn_opts):
                all_parsers.read_csv(StringIO(data), **read_opts)
        return

    with tm.assert_produces_warning(FutureWarning, **warn_opts):
        parsed = all_parsers.read_csv(StringIO(data), **read_opts)
    tm.assert_frame_equal(parsed, DataFrame({"MyColumn": list("abab")}))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "sep,skip_blank_lines,exp_data",
    [
        (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (
            ",",
            False,
            [
                [1.0, 2.0, 4.0],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan],
                [5.0, np.nan, 10.0],
                [np.nan, np.nan, np.nan],
                [-70.0, 0.4, 1.0],
            ],
        ),
    ],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request):
    """Blank lines are dropped or kept as all-NaN rows per ``skip_blank_lines``."""
    csv_text = """\
A,B,C
1,2.,4.


5.,NaN,10.0

-70,.4,1
"""
    read_opts = {"sep": sep, "skip_blank_lines": skip_blank_lines}

    if sep == r"\s+":
        csv_text = csv_text.replace(",", " ")

        # Only the regex-separator case is expected to fail under pyarrow.
        if all_parsers.engine == "pyarrow":
            unsupported = "the 'pyarrow' engine does not support regex separators"
            with pytest.raises(ValueError, match=unsupported):
                all_parsers.read_csv(StringIO(csv_text), **read_opts)
            return

    parsed = all_parsers.read_csv(StringIO(csv_text), **read_opts)
    tm.assert_frame_equal(parsed, DataFrame(exp_data, columns=["A", "B", "C"]))
|
||||
|
||||
|
||||
@skip_pyarrow
def test_whitespace_lines(all_parsers):
    """Lines holding only whitespace are ignored when reading with default options."""
    data = """

\t \t\t
\t
A,B,C
\t 1,2.,4.
5.,NaN,10.0
"""
    result = all_parsers.read_csv(StringIO(data))

    rows = [[1, 2.0, 4.0], [5.0, np.nan, 10.0]]
    tm.assert_frame_equal(result, DataFrame(rows, columns=["A", "B", "C"]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            """ A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
""",
            DataFrame(
                [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                columns=["A", "B", "C", "D"],
                index=["a", "b", "c"],
            ),
        ),
        (
            " a b c\n1 2 3 \n4 5 6\n 7 8 9",
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
        ),
    ],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
    """Parse whitespace-delimited data with the regex separator ``\\s+`` (gh-6607).

    The pyarrow engine is expected to raise ``ValueError`` instead.
    """
    parser = all_parsers
    buffer = StringIO(data)

    if parser.engine != "pyarrow":
        tm.assert_frame_equal(parser.read_csv(buffer, sep=r"\s+"), expected)
        return

    msg = "the 'pyarrow' engine does not support regex separators"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(buffer, sep=r"\s+")
|
||||
|
||||
|
||||
def test_sub_character(all_parsers, csv_dir_path):
    """Read ``sub_char.csv``, whose header contains a ``\\x1a`` character (gh-16893)."""
    csv_file = os.path.join(csv_dir_path, "sub_char.csv")
    result = all_parsers.read_csv(csv_file)

    expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
def test_filename_with_special_chars(all_parsers, filename):
    """Round-trip a frame through a file whose name has non-ASCII characters (gh-15086)."""
    frame = DataFrame({"a": [1, 2, 3]})

    with tm.ensure_clean(filename) as path:
        frame.to_csv(path, index=False)
        round_tripped = all_parsers.read_csv(path)
        tm.assert_frame_equal(round_tripped, frame)
|
||||
|
||||
|
||||
def test_read_table_same_signature_as_read_csv(all_parsers):
    """``read_table`` and ``read_csv`` share a signature except for the ``sep`` default (GH-34976)."""
    parser = all_parsers

    csv_sign = signature(parser.read_csv)
    table_sign = signature(parser.read_table)

    assert table_sign.parameters.keys() == csv_sign.parameters.keys()
    assert table_sign.return_annotation == csv_sign.return_annotation

    for key, csv_param in csv_sign.parameters.items():
        table_param = table_sign.parameters[key]
        if key != "sep":
            assert table_param == csv_param
            continue

        # ``sep`` is the one parameter allowed to differ, and only in its default.
        assert csv_param.default == ","
        assert table_param.default == "\t"
        assert table_param.annotation == csv_param.annotation
        assert table_param.kind == csv_param.kind
|
||||
|
||||
|
||||
def test_read_table_equivalency_to_read_csv(all_parsers):
    """``read_table`` gives the same frame as ``read_csv(sep="\\t")`` (gh-21948).

    As of 0.25.0, read_table is undeprecated.
    """
    parser = all_parsers
    data = "a\tb\n1\t2\n3\t4"

    via_csv = parser.read_csv(StringIO(data), sep="\t")
    via_table = parser.read_table(StringIO(data))

    tm.assert_frame_equal(via_table, via_csv)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
    """Read while a ``sys.setprofile`` hook is installed (GH#41069).

    The profile function that was active before the test is captured and
    restored in a ``finally`` clause, so a failing read cannot leave the no-op
    hook installed for the tests that run afterwards.
    """
    parser = all_parsers
    data = "a b\n0 1"

    previous_profiler = sys.getprofile()
    sys.setprofile(lambda *a, **k: None)
    try:
        result = getattr(parser, read_func)(StringIO(data))
    finally:
        # Put back whatever was installed before (``None`` when nothing was).
        sys.setprofile(previous_profiler)

    expected = DataFrame({"a b": ["0 1"]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
def test_first_row_bom(all_parsers):
    """A leading BOM before quoted, tab-separated headers is dropped (gh-26545)."""
    data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''
    result = all_parsers.read_csv(StringIO(data), delimiter="\t")

    tm.assert_frame_equal(result, DataFrame(columns=["Head1", "Head2", "Head3"]))
|
||||
|
||||
|
||||
@skip_pyarrow
def test_first_row_bom_unquoted(all_parsers):
    """A leading BOM before unquoted, tab-separated headers is dropped (gh-36343)."""
    data = """\ufeffHead1\tHead2\tHead3"""
    result = all_parsers.read_csv(StringIO(data), delimiter="\t")

    tm.assert_frame_equal(result, DataFrame(columns=["Head1", "Head2", "Head3"]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
    """Blank lines after the header become NaN rows with ``skip_blank_lines=False`` (GH 28071).

    With the pyarrow engine the ``nrows`` option is expected to raise ``ValueError``.
    """
    parser = all_parsers
    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
    read_kwargs = {"header": 3, "nrows": nrows, "skip_blank_lines": False}

    if parser.engine == "pyarrow":
        msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(csv), **read_kwargs)
        return

    ref = DataFrame(
        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
        columns=list("ab"),
    )
    df = parser.read_csv(StringIO(csv), **read_kwargs)
    tm.assert_frame_equal(df, ref[:nrows])
|
||||
|
||||
|
||||
@skip_pyarrow
def test_no_header_two_extra_columns(all_parsers):
    """Extra trailing columns are dropped with a ``ParserWarning`` when ``index_col=False`` (GH 26218)."""
    column_names = ["one", "two", "three"]
    warning_msg = (
        "Length of header or names does not match length of data. "
        "This leads to a loss of data with index_col=False."
    )

    df = all_parsers.read_csv_check_warnings(
        ParserWarning,
        warning_msg,
        StringIO("foo,bar,baz,bam,blah"),
        header=None,
        names=column_names,
        index_col=False,
    )

    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
    tm.assert_frame_equal(df, ref)
|
||||
|
||||
|
||||
def test_read_csv_names_not_accepting_sets(all_parsers):
    """Passing a ``set`` as ``names`` raises ``ValueError`` (GH 34946)."""
    data = "1,2,3\n4,5,6\n"
    unordered_names = set("QAZ")

    with pytest.raises(ValueError, match="Names should be an ordered collection."):
        all_parsers.read_csv(StringIO(data), names=unordered_names)
|
||||
|
||||
|
||||
def test_read_table_delim_whitespace_default_sep(all_parsers):
    """``read_table`` accepts ``delim_whitespace=True`` with its default ``sep`` (GH: 35958).

    The deprecation ``FutureWarning`` is checked on every call; the pyarrow
    engine is expected to raise ``ValueError``.
    """
    parser = all_parsers
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

    def expect_deprecation():
        # A fresh warning-checking context manager for each read call.
        return tm.assert_produces_warning(
            FutureWarning, match=depr_msg, check_stacklevel=False
        )

    if parser.engine == "pyarrow":
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg), expect_deprecation():
            parser.read_table(f, delim_whitespace=True)
        return

    with expect_deprecation():
        result = parser.read_table(f, delim_whitespace=True)

    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
    """``read_csv`` rejects ``delim_whitespace=True`` combined with a separator (GH: 35958).

    The separator is supplied first as ``sep`` and then as ``delimiter``.
    """
    parser = all_parsers
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        for keyword in ("sep", "delimiter"):
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(f, delim_whitespace=True, **{keyword: delimiter})
|
||||
|
||||
|
||||
def test_read_csv_delimiter_and_sep_no_default(all_parsers):
    """Passing both ``sep`` and ``delimiter`` raises ``ValueError`` (GH#39823)."""
    buffer = StringIO("a,b\n1,2")
    expected_msg = "Specified a sep and a delimiter; you can only specify one."

    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(buffer, sep=" ", delimiter=".")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
    """Using a line break as separator or delimiter raises ``ValueError`` (GH#43528)."""
    data = "a,b,c\n1,2,3\n"
    msg = (
        r"Specified \\n as separator or delimiter. This forces the python engine "
        r"which does not accept a line terminator. Hence it is not allowed to use "
        r"the line terminator as separator."
    )

    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
    """``read_table`` rejects ``delim_whitespace=True`` combined with a separator (GH: 35958).

    The separator is supplied first as ``sep`` and then as ``delimiter``.
    """
    parser = all_parsers
    f = StringIO("a b c\n1 -2 -3\n4 5 6")
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        for keyword in ("sep", "delimiter"):
            with pytest.raises(ValueError, match=msg):
                parser.read_table(f, delim_whitespace=True, **{keyword: delimiter})
|
||||
|
||||
|
||||
@skip_pyarrow
def test_dict_keys_as_names(all_parsers):
    """A ``dict_keys`` view is accepted as ``names`` (GH: 36928)."""
    column_types = {"a": int, "b": int}

    result = all_parsers.read_csv(StringIO("1,2"), names=column_types.keys())

    tm.assert_frame_equal(result, DataFrame({"a": [1], "b": [2]}))
|
||||
|
||||
|
||||
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@xfail_pyarrow  # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
    """Read surrogate bytes with ``encoding_errors="surrogatepass"`` (GH39017).

    The same file read without that option must raise ``UnicodeDecodeError``.
    """
    parser = all_parsers
    content = b"\xed\xbd\xbf"
    decoded = content.decode("utf-8", errors="surrogatepass")

    # Header row and data row carry identical bytes: "<content x2>,<content>".
    row = content * 2 + b"," + content
    payload = row + b"\n" + row

    expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
    expected.index.name = decoded * 2

    with tm.ensure_clean() as path:
        Path(path).write_bytes(payload)
        df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
        tm.assert_frame_equal(df, expected)
        with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
            parser.read_csv(path)
|
||||
|
||||
|
||||
def test_malformed_second_line(all_parsers):
    """Read data that starts with an empty line, using ``header=1`` (GH14782)."""
    buffer = StringIO("\na\nb\n")

    result = all_parsers.read_csv(buffer, skip_blank_lines=False, header=1)

    tm.assert_frame_equal(result, DataFrame({"a": ["b"]}))
|
||||
|
||||
|
||||
@skip_pyarrow
def test_short_single_line(all_parsers):
    """A single row shorter than ``names`` is padded with NaN (GH 47566)."""
    result = all_parsers.read_csv(
        StringIO("1,2"), header=None, names=["a", "b", "c"]
    )

    tm.assert_frame_equal(result, DataFrame({"a": [1], "b": [2], "c": [np.nan]}))
|
||||
|
||||
|
||||
@xfail_pyarrow  # ValueError: Length mismatch: Expected axis has 2 elements
def test_short_multi_line(all_parsers):
    """Several rows shorter than ``names`` are padded with NaN (GH 47566)."""
    result = all_parsers.read_csv(
        StringIO("1,2\n1,2"), header=None, names=["a", "b", "c"]
    )

    expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_seek(all_parsers):
    """Read from an open file handle after its first line was consumed (GH48646)."""
    parser = all_parsers
    prefix = "### DATA\n"
    content = "nkey,value\ntables,rectangular\n"

    with tm.ensure_clean() as path:
        Path(path).write_text(prefix + content, encoding="utf-8")
        with open(path, encoding="utf-8") as handle:
            # Consume the prefix line so the parser starts at the CSV content.
            handle.readline()
            actual = parser.read_csv(handle)

    expected = parser.read_csv(StringIO(content))
    tm.assert_frame_equal(actual, expected)
|
||||
@@ -0,0 +1,91 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.parsers import TextParser
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
|
||||
|
||||
@xfail_pyarrow
def test_read_data_list(all_parsers):
    """``TextParser`` over a list of rows matches ``read_csv`` on the equivalent text."""
    kwargs = {"index_col": 0}
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"
    data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]

    expected = all_parsers.read_csv(StringIO(data), **kwargs)

    with TextParser(data_list, chunksize=2, **kwargs) as text_parser:
        result = text_parser.read()

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_reader_list(all_parsers):
    """Iterating a chunked ``TextParser`` over row lists yields slices of the full frame."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    kwargs = {"index_col": 0}

    rows = list(csv.reader(StringIO(data)))
    with TextParser(rows, chunksize=2, **kwargs) as reader:
        chunks = list(reader)

    expected = all_parsers.read_csv(StringIO(data), **kwargs)

    expected_slices = (slice(None, 2), slice(2, 4), slice(4, None))
    for position, rows_slice in enumerate(expected_slices):
        tm.assert_frame_equal(chunks[position], expected[rows_slice])
|
||||
|
||||
|
||||
def test_reader_list_skiprows(all_parsers):
    """With ``skiprows=[1]`` the first chunk of a chunked ``TextParser`` equals rows 1-2 of the full frame."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    kwargs = {"index_col": 0}

    rows = list(csv.reader(StringIO(data)))
    with TextParser(rows, chunksize=2, skiprows=[1], **kwargs) as reader:
        chunks = list(reader)

    expected = all_parsers.read_csv(StringIO(data), **kwargs)

    first_chunk = chunks[0]
    tm.assert_frame_equal(first_chunk, expected[1:3])
|
||||
|
||||
|
||||
def test_read_csv_parse_simple_list(all_parsers):
    """Read a headerless, single-column file where some values contain spaces."""
    data = "foo\nbar baz\nqux foo\nfoo\nbar"

    result = all_parsers.read_csv(StringIO(data), header=None)

    values = ["foo", "bar baz", "qux foo", "foo", "bar"]
    tm.assert_frame_equal(result, DataFrame(values))
|
||||
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
    """Parse numbers using both a thousands separator and a decimal mark.

    With the pyarrow engine the ``thousands`` option is expected to raise ``ValueError``.
    """
    parser = all_parsers
    read_kwargs = {"sep": "|", "thousands": thousands, "decimal": decimal}

    if parser.engine == "pyarrow":
        msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **read_kwargs)
        return

    result = parser.read_csv(StringIO(data), **read_kwargs)
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_euro_decimal_format(all_parsers):
    """Parse semicolon-separated data that uses a comma as the decimal mark."""
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    columns = ["Id", "Number1", "Number2", "Text1", "Text2", "Number3"]
    rows = [
        [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
        [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
        [3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
    ]

    result = all_parsers.read_csv(StringIO(data), sep=";", decimal=",")
    tm.assert_frame_equal(result, DataFrame(rows, columns=columns))
|
||||
@@ -0,0 +1,478 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import (
|
||||
BytesIO,
|
||||
StringIO,
|
||||
)
|
||||
import os
|
||||
import platform
|
||||
from urllib.error import URLError
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import (
|
||||
EmptyDataError,
|
||||
ParserError,
|
||||
)
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.single_cpu
def test_url(all_parsers, csv_dir_path, httpserver):
    """Reading ``salaries.csv`` from the test HTTP server matches reading the local file."""
    parser = all_parsers
    read_kwargs = {"sep": "\t"}
    local_path = os.path.join(csv_dir_path, "salaries.csv")

    with open(local_path, encoding="utf-8") as handle:
        httpserver.serve_content(content=handle.read())

    from_url = parser.read_csv(httpserver.url, **read_kwargs)
    from_disk = parser.read_csv(local_path, **read_kwargs)
    tm.assert_frame_equal(from_url, from_disk)
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_local_file(all_parsers, csv_dir_path):
    """Reading through a ``file://localhost/`` URL matches reading the plain path.

    The test is skipped when the URL read raises ``URLError``.
    """
    parser = all_parsers
    read_kwargs = {"sep": "\t"}

    local_path = os.path.join(csv_dir_path, "salaries.csv")
    local_result = parser.read_csv(local_path, **read_kwargs)
    url = "file://localhost/" + local_path

    try:
        url_result = parser.read_csv(url, **read_kwargs)
    except URLError:
        # Fails on some systems.
        pytest.skip("Failing on: " + " ".join(platform.uname()))
    else:
        tm.assert_frame_equal(url_result, local_result)
|
||||
|
||||
|
||||
@xfail_pyarrow  # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
    """Round-trip a frame through ``to_csv``/``read_csv`` via ``tm.round_trip_pathlib``."""
    parser = all_parsers

    def read_back(p):
        return parser.read_csv(p, index_col=0)

    values = 1.1 * np.arange(120).reshape((30, 4))
    row_labels = Index([f"i-{i}" for i in range(30)])
    df = DataFrame(values, columns=Index(list("ABCD")), index=row_labels)

    result = tm.round_trip_pathlib(df.to_csv, read_back)
    tm.assert_frame_equal(df, result)
|
||||
|
||||
|
||||
@xfail_pyarrow  # AssertionError: DataFrame.index are different
def test_path_local_path(all_parsers):
    """Round-trip a frame through ``to_csv``/``read_csv`` via ``tm.round_trip_localpath``."""
    parser = all_parsers

    def read_back(p):
        return parser.read_csv(p, index_col=0)

    values = 1.1 * np.arange(120).reshape((30, 4))
    row_labels = Index([f"i-{i}" for i in range(30)])
    df = DataFrame(values, columns=Index(list("ABCD")), index=row_labels)

    result = tm.round_trip_localpath(df.to_csv, read_back)
    tm.assert_frame_equal(df, result)
|
||||
|
||||
|
||||
def test_nonexistent_path(all_parsers):
    """Reading a missing file raises ``FileNotFoundError`` carrying the plain path.

    gh-2428: pls no segfault
    gh-14086: raise more helpful FileNotFoundError
    GH#29233 "File foo" instead of "File b'foo'"
    """
    missing_path = f"{uuid.uuid4()}.csv"

    with pytest.raises(FileNotFoundError, match=r"\[Errno 2\]") as excinfo:
        all_parsers.read_csv(missing_path)
    assert missing_path == excinfo.value.filename
|
||||
|
||||
|
||||
@td.skip_if_windows  # os.chmod does not work in windows
def test_no_permission(all_parsers):
    """Reading an unreadable file raises ``PermissionError`` carrying the path (GH 23784).

    The test is skipped when this process can still open the file after its
    permission bits were cleared.
    """
    parser = all_parsers
    msg = r"\[Errno 13\]"

    with tm.ensure_clean() as path:
        os.chmod(path, 0)  # make file unreadable

        # verify that this process cannot open the file (not running as sudo)
        try:
            with open(path, encoding="utf-8"):
                pass
        except PermissionError:
            pass
        else:
            pytest.skip("Running as sudo.")

        with pytest.raises(PermissionError, match=msg) as excinfo:
            parser.read_csv(path)
        assert path == excinfo.value.filename
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data,kwargs,expected,msg",
    [
        # gh-10728: WHITESPACE_LINE
        (
            "a,b,c\n4,5,6\n ",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # gh-10548: EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL_NOP
        (
            "a,b,c\n4,5,6\n\r",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_COMMENT
        (
            "a,b,c\n4,5,6#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # SKIP_LINE
        (
            "a,b,c\n4,5,6\nskipme",
            {"skiprows": [2]},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#", "skip_blank_lines": False},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # IN_FIELD
        (
            "a,b,c\n4,5,6\n ",
            {"skip_blank_lines": False},
            DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL
        (
            "a,b,c\n4,5,6\n\r",
            {"skip_blank_lines": False},
            DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # ESCAPED_CHAR
        (
            "a,b,c\n4,5,6\n\\",
            {"escapechar": "\\"},
            None,
            "(EOF following escape character)|(unexpected end of data)",
        ),
        # ESCAPE_IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"\\',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
        # IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
    ],
    ids=[
        "whitespace-line",
        "eat-line-comment",
        "eat-crnl-nop",
        "eat-comment",
        "skip-line",
        "eat-line-comment",
        "in-field",
        "eat-crnl",
        "escaped-char",
        "escape-in-quoted-field",
        "in-quoted-field",
    ],
)
def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
    """Exercise inputs that end in different tokenizer states (gh-10728, gh-10548).

    Each case either yields ``expected`` or, when ``expected`` is ``None``,
    raises ``ParserError`` matching ``msg``. For the pyarrow engine, cases that
    pass ``comment`` must raise ``ValueError`` and the remaining cases whose
    data has no ``\\r`` are skipped.
    """
    parser = all_parsers

    if parser.engine == "pyarrow":
        if "comment" in kwargs:
            unsupported_msg = (
                "The 'comment' option is not supported with the 'pyarrow' engine"
            )
            with pytest.raises(ValueError, match=unsupported_msg):
                parser.read_csv(StringIO(data), **kwargs)
            return

        if "\r" not in data:
            # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1:
            # ValueError: skiprows argument must be an integer when using engine='pyarrow'
            # AssertionError: Regex pattern did not match.
            pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    if expected is not None:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)
        return

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
|
||||
def test_temporary_file(all_parsers):
    """Read whitespace-separated data from a temporary file-like object (gh-13398).

    With the pyarrow engine the regex separator is expected to raise ``ValueError``.
    """
    parser = all_parsers
    read_kwargs = {"sep": r"\s+", "header": None}

    with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
        new_file.write("0 0")
        new_file.flush()
        new_file.seek(0)

        if parser.engine == "pyarrow":
            msg = "the 'pyarrow' engine does not support regex separators"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(new_file, **read_kwargs)
            return

        result = parser.read_csv(new_file, **read_kwargs)

        tm.assert_frame_equal(result, DataFrame([[0, 0]]))
|
||||
|
||||
|
||||
def test_internal_eof_byte(all_parsers):
    """A ``\\x1a`` character inside a field is kept in the parsed value (gh-5500)."""
    data = "a,b\n1\x1a,2"

    result = all_parsers.read_csv(StringIO(data))

    tm.assert_frame_equal(result, DataFrame([["1\x1a", 2]], columns=["a", "b"]))
|
||||
|
||||
|
||||
def test_internal_eof_byte_to_file(all_parsers):
    """A ``\\x1a`` byte inside a quoted field survives reading from a file (gh-16559)."""
    raw = b'c1,c2\r\n"test \x1a test", test\r\n'
    file_name = f"__{uuid.uuid4()}__.csv"

    with tm.ensure_clean(file_name) as path:
        with open(path, "wb") as handle:
            handle.write(raw)

        result = all_parsers.read_csv(path)

    expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_file_handle_string_io(all_parsers):
    """``read_csv`` leaves a caller-supplied ``StringIO`` open (gh-14418).

    Don't close user provided file handles.
    """
    buffer = StringIO("a,b\n1,2")

    all_parsers.read_csv(buffer)

    assert not buffer.closed
|
||||
|
||||
|
||||
def test_file_handles_with_open(all_parsers, csv1):
    """``read_csv`` leaves handles from ``open`` open, in text and binary mode (gh-14418).

    Don't close user provided file handles.
    """
    parser = all_parsers

    # Text mode is opened as UTF-8; binary mode takes no encoding.
    for mode, encoding in (("r", "utf-8"), ("rb", None)):
        with open(csv1, mode, encoding=encoding) as handle:
            parser.read_csv(handle)
            assert not handle.closed
|
||||
|
||||
|
||||
def test_invalid_file_buffer_class(all_parsers):
    """An instance of an arbitrary empty class is rejected with ``ValueError`` (gh-15337)."""

    class InvalidBuffer:
        pass

    expected_msg = "Invalid file path or buffer object type"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(InvalidBuffer())
|
||||
|
||||
|
||||
def test_invalid_file_buffer_mock(all_parsers):
    """An instance of a plain ``Foo`` class is rejected with ``ValueError`` (gh-15337)."""

    class Foo:
        pass

    expected_msg = "Invalid file path or buffer object type"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(Foo())
|
||||
|
||||
|
||||
def test_valid_file_buffer_seems_invalid(all_parsers):
    """Read from a ``StringIO`` subclass whose ``tell`` and ``seek`` raise (gh-16135).

    We want to ensure that "tell" and "seek" aren't actually being used when
    we call `read_csv`. Thus, while the object may look "invalid" (these
    methods are attributes of the `StringIO` class), it is still a valid
    file-object for our purposes.
    """

    class NoSeekTellBuffer(StringIO):
        def tell(self):
            raise AttributeError("No tell method")

        def seek(self, pos, whence=0):
            raise AttributeError("No seek method")

    buffer = NoSeekTellBuffer("a\n1")

    result = all_parsers.read_csv(buffer)

    tm.assert_frame_equal(result, DataFrame({"a": [1]}))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_read_csv_file_handle(all_parsers, io_class, encoding):
    """
    Test whether read_csv does not close user-provided file handles.

    GH 36980
    """
    content = "a,b\n1,2"
    if io_class == BytesIO:
        # Binary buffers need the UTF-8 encoded payload.
        handle = io_class(content.encode("utf-8"))
    else:
        handle = io_class(content)

    result = all_parsers.read_csv(handle, encoding=encoding)

    tm.assert_frame_equal(result, DataFrame({"a": [1], "b": [2]}))
    assert not handle.closed
|
||||
|
||||
|
||||
def test_memory_map_compression(all_parsers, compression):
    """
    Support memory map for compressed files.

    GH 37621
    """
    parser = all_parsers
    expected = DataFrame({"a": [1], "b": [2]})
    read_kwargs = {"memory_map": True, "compression": compression}

    with tm.ensure_clean() as path:
        expected.to_csv(path, index=False, compression=compression)

        if parser.engine == "pyarrow":
            msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(path, **read_kwargs)
            return

        result = parser.read_csv(path, **read_kwargs)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_context_manager(all_parsers, datapath):
    """Check that a file opened by ``read_csv`` is closed when the reader's ``with`` block fails.

    A dedicated sentinel exception is raised inside the ``with`` block and
    ``pytest.raises`` confirms it propagates. The handle is then checked
    unconditionally, so the test cannot pass without performing the check and
    an unrelated ``AssertionError`` from the reader is not mistaken for the
    deliberate failure. With the pyarrow engine ``chunksize`` is expected to
    raise ``ValueError``.
    """
    # make sure that opened files are closed
    parser = all_parsers

    path = datapath("io", "data", "csv", "iris.csv")

    if parser.engine == "pyarrow":
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(path, chunksize=1)
        return

    class _Abort(Exception):
        """Raised on purpose to leave the ``with`` block through an error."""

    reader = parser.read_csv(path, chunksize=1)
    assert not reader.handles.handle.closed

    with pytest.raises(_Abort):
        with reader:
            next(reader)
            raise _Abort

    assert reader.handles.handle.closed
|
||||
|
||||
|
||||
def test_context_manageri_user_provided(all_parsers, datapath):
    """Check that a user-provided handle stays open when the reader's ``with`` block fails.

    A dedicated sentinel exception is raised inside the ``with`` block and
    ``pytest.raises`` confirms it propagates. The handle is then checked
    unconditionally, so the test cannot pass without performing the check and
    an unrelated ``AssertionError`` from the reader is not mistaken for the
    deliberate failure. With the pyarrow engine ``chunksize`` is expected to
    raise ``ValueError``.
    """
    # make sure that user-provided handles are not closed
    parser = all_parsers

    class _Abort(Exception):
        """Raised on purpose to leave the ``with`` block through an error."""

    with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
        if parser.engine == "pyarrow":
            msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                parser.read_csv(path, chunksize=1)
            return

        reader = parser.read_csv(path, chunksize=1)
        assert not reader.handles.handle.closed

        with pytest.raises(_Abort):
            with reader:
                next(reader)
                raise _Abort

        assert not reader.handles.handle.closed
|
||||
|
||||
|
||||
@skip_pyarrow  # ParserError: Empty CSV file
def test_file_descriptor_leak(all_parsers, using_copy_on_write):
    """Reading the empty temporary file must raise ``EmptyDataError`` (GH 31488)."""
    parser = all_parsers
    expected_msg = "No columns to parse from file"

    with tm.ensure_clean() as empty_path, pytest.raises(
        EmptyDataError, match=expected_msg
    ):
        parser.read_csv(empty_path)
|
||||
|
||||
|
||||
def test_memory_map(all_parsers, csv_dir_path):
    """Read ``test_mmap.csv`` with ``memory_map=True`` and check the parsed frame."""
    parser = all_parsers
    mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")

    if parser.engine == "pyarrow":
        # The option is rejected by this engine; only the error is checked.
        unsupported = (
            "The 'memory_map' option is not supported with the 'pyarrow' engine"
        )
        with pytest.raises(ValueError, match=unsupported):
            parser.read_csv(mmap_file, memory_map=True)
        return

    columns = {
        "a": [1, 2, 3],
        "b": ["one", "two", "three"],
        "c": ["I", "II", "III"],
    }
    expected = DataFrame(columns)

    result = parser.read_csv(mmap_file, memory_map=True)
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import is_platform_linux
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@skip_pyarrow  # ParserError: CSV parse error: Empty CSV file or block
def test_float_parser(all_parsers):
    """Each field of a headerless float row must parse like ``float()`` (gh-9565)."""
    parser = all_parsers
    data = "45e-1,4.5,45.,inf,-inf"

    # Python's own float() conversion of every field is the reference.
    reference_row = list(map(float, data.split(",")))
    expected = DataFrame([reference_row])

    result = parser.read_csv(StringIO(data), header=None)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_scientific_no_exponent(all_parsers_all_precisions):
    """Values such as "2e" (no exponent digits) must round-trip unchanged (gh-12215)."""
    parser, precision = all_parsers_all_precisions

    original = DataFrame.from_dict(
        {"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}
    )
    csv_text = original.to_csv(index=False)

    roundtripped = parser.read_csv(StringIO(csv_text), float_precision=precision)
    tm.assert_frame_equal(roundtripped, original)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "neg_exp",
    [
        -617,
        -100000,
        pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan),
    ],
)
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
    """A value with a hugely negative exponent is expected to parse as 0.0 (GH#38753)."""
    parser, precision = all_parsers_all_precisions
    csv_text = f"data\n10E{neg_exp}"

    expected = DataFrame({"data": [0.0]})
    result = parser.read_csv(StringIO(csv_text), float_precision=precision)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.skip_ubsan
@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
    """Check parsing of a value whose exponent has very many digits (GH#38753).

    With ``float_precision="round_trip"`` the expected value is ``inf`` for a
    positive exponent and ``0.0`` for a negative one; every other precision is
    expected to leave the field as the original string.
    """
    parser, precision = all_parsers_all_precisions
    text_value = f"10E{exp}"
    result = parser.read_csv(
        StringIO(f"data\n10E{exp}"), float_precision=precision
    )

    if precision != "round_trip":
        expected = DataFrame({"data": [text_value]})
    else:
        if exp == 999999999999999999 and is_platform_linux():
            # Known failure for this combination, see the xfail reason.
            request.applymarker(
                pytest.mark.xfail(reason="GH38794, on Linux gives object result")
            )

        if exp > 0:
            value = np.inf
        else:
            value = 0.0
        expected = DataFrame({"data": [value]})

    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,304 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # Single index column, names passed explicitly.
        (
            """foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""",
            {"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
            DataFrame(
                [[2, 3, 4, 5], [7, 8, 9, 10]] + [[12, 13, 14, 15]] * 4,
                index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
                columns=["A", "B", "C", "D"],
            ),
        ),
        # Two index columns, names passed explicitly.
        (
            """foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""",
            {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
            DataFrame(
                [[2, 3, 4, 5], [7, 8, 9, 10]] + [[12, 13, 14, 15]] * 3,
                index=MultiIndex.from_tuples(
                    list(
                        zip(
                            ["foo", "foo", "foo", "bar", "bar"],
                            ["one", "two", "three", "one", "two"],
                        )
                    ),
                    names=["index1", "index2"],
                ),
                columns=["A", "B", "C", "D"],
            ),
        ),
    ],
)
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
    """Headerless data read with explicit ``names`` and ``index_col`` matches ``expected``."""
    parser = all_parsers
    parsed = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(
    request, all_parsers, index_col, using_infer_string
):
    """Headerless data with names for the value columns only gives unnamed index levels."""
    parser = all_parsers
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    # Everything after the first newline, i.e. the data without its header row.
    headless_data = data.partition("\n")[2]
    value_names = ["A", "B", "C", "D"]

    result = parser.read_csv(
        StringIO(headless_data),
        index_col=index_col,
        header=None,
        names=value_names,
    )

    # Reference: the same data read with its header, with the index names cleared
    # because the headless data carries no index names.
    expected = parser.read_csv(StringIO(data), index_col=index_col)
    expected.index.names = [None, None]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
def test_multi_index_no_level_names_implicit(all_parsers):
    """A header two fields shorter than the rows yields an unnamed two-level index."""
    parser = all_parsers
    data = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

    index_tuples = [
        ("foo", "one"),
        ("foo", "two"),
        ("foo", "three"),
        ("bar", "one"),
        ("bar", "two"),
    ]
    values = [[2, 3, 4, 5], [7, 8, 9, 10]] + [[12, 13, 14, 15]] * 3
    expected = DataFrame(
        values,
        columns=["A", "B", "C", "D"],
        index=MultiIndex.from_tuples(index_tuples),
    )

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow  # TypeError: an integer is required
@pytest.mark.parametrize(
    "data,expected,header",
    [
        ("a,b", DataFrame(columns=["a", "b"]), [0]),
        (
            "a,b\nc,d",
            DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
            [0, 1],
        ),
    ],
)
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
    """Header-only input read with list-like ``header`` gives an empty frame (gh-14545)."""
    parser = all_parsers

    if round_trip:
        # Use the CSV rendering of the expected frame instead of the literal text.
        data = expected.to_csv(index=False)

    parsed = parser.read_csv(StringIO(data), header=header)
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow  # AssertionError: DataFrame.columns are different
def test_no_unnamed_index(all_parsers):
    """A blank leading header field becomes a regular "Unnamed: 0" column."""
    parser = all_parsers
    data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
    rows = [
        [0, 1, 0, "a", "b"],
        [1, 2, 0, "c", "d"],
        [2, 2, 2, "e", "f"],
    ]
    expected = DataFrame(rows, columns=["Unnamed: 0", "id", "c0", "c1", "c2"])

    result = parser.read_csv(StringIO(data), sep=" ")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_duplicate_index_explicit(all_parsers):
    """Duplicate labels in an explicitly selected index column are kept as-is."""
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    labels = ["foo", "bar", "baz", "qux", "foo", "bar"]
    values = [[2, 3, 4, 5], [7, 8, 9, 10]] + [[12, 13, 14, 15]] * 4
    expected = DataFrame(
        values,
        columns=["A", "B", "C", "D"],
        index=Index(labels, name="index"),
    )

    result = parser.read_csv(StringIO(data), index_col=0)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
def test_read_duplicate_index_implicit(all_parsers):
    """Duplicate labels in an implicit (header one field short) index are kept as-is."""
    parser = all_parsers
    data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    labels = ["foo", "bar", "baz", "qux", "foo", "bar"]
    values = [[2, 3, 4, 5], [7, 8, 9, 10]] + [[12, 13, 14, 15]] * 4
    expected = DataFrame(
        values,
        columns=["A", "B", "C", "D"],
        index=Index(labels),
    )

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
    """Read ``test2.csv`` with a date-parsed first column used as unnamed index."""
    parser = all_parsers
    csv2 = os.path.join(csv_dir_path, "test2.csv")

    # Expected index: 2000-01-03 through 2000-01-07.
    dates = [datetime(2000, 1, day) for day in range(3, 8)]
    rows = [
        [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
        [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
        [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
        [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
        [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
    ]
    expected = DataFrame(
        rows,
        columns=["A", "B", "C", "D", "E"],
        index=Index(dates),
    )

    result = parser.read_csv(csv2, index_col=0, parse_dates=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow
def test_empty_with_index(all_parsers):
    """Header-only input with ``index_col=0`` gives an empty frame indexed by "x" (gh-10184)."""
    parser = all_parsers
    header_only = "x,y"

    empty_index = Index([], name="x")
    expected = DataFrame(columns=["y"], index=empty_index)

    result = parser.read_csv(StringIO(header_only), index_col=0)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_multi_index(all_parsers):
    """Header-only input with two named index columns gives an empty MultiIndex (gh-10467)."""
    parser = all_parsers
    header_only = "x,y,z"

    empty_index = MultiIndex.from_arrays([[], []], names=["x", "y"])
    expected = DataFrame(columns=["z"], index=empty_index)

    result = parser.read_csv(StringIO(header_only), index_col=["x", "y"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
# CSV parse error: Empty CSV file or block: cannot infer number of columns
@skip_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
    """``index_col=[1, 0]`` on header-only input names the empty levels "y" then "x"."""
    parser = all_parsers
    header_only = "x,y,z"

    empty_index = MultiIndex.from_arrays([[], []], names=["y", "x"])
    expected = DataFrame(columns=["z"], index=empty_index)

    result = parser.read_csv(StringIO(header_only), index_col=[1, 0])
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,78 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
|
||||
|
||||
@xfail_pyarrow  # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
    """Spellings of "inf" in mixed case, with and without sign, parse as +/- infinity."""
    parser = all_parsers
    data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""

    # Rows alternate between positive and negative infinity.
    alternating = [float("inf"), float("-inf")] * 5
    expected = DataFrame({"A": alternating}, index=list("abcdefghij"))

    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow  # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_infinity_parsing(all_parsers, na_filter):
    """"Infinity", "-Infinity" and "+Infinity" parse like Python's ``float()``."""
    parser = all_parsers
    data = """\
,A
a,Infinity
b,-Infinity
c,+Infinity
"""
    values = [float(text) for text in ("infinity", "-infinity", "+infinity")]
    expected = DataFrame({"A": values}, index=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_read_csv_with_use_inf_as_na(all_parsers):
    """Reading under the ``use_inf_as_na`` option warns and still parses NaN.

    See https://github.com/pandas-dev/pandas/issues/35493
    """
    parser = all_parsers
    data = "1.0\nNaN\n3.0"

    # For the pyarrow engine a DeprecationWarning is accepted as well.
    if parser.engine == "pyarrow":
        warn = (FutureWarning, DeprecationWarning)
    else:
        warn = FutureWarning

    with tm.assert_produces_warning(
        warn, match="use_inf_as_na option is deprecated", check_stacklevel=False
    ), option_context("use_inf_as_na", True):
        result = parser.read_csv(StringIO(data), header=None)

    expected = DataFrame([1.0, np.nan, 3.0])
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,231 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
def test_int_conversion(all_parsers):
    """Column "A" (1.0, 2.0, ...) stays float while column "B" (1, 2, ...) is integer."""
    parser = all_parsers
    data = """A,B
1.0,1
2.0,2
3.0,3
"""
    rows = [[1.0, 1], [2.0, 2], [3.0, 3]]
    expected = DataFrame(rows, columns=["A", "B"])

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # Default recognition of "True"/"False".
        (
            "A,B\nTrue,1\nFalse,2\nTrue,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        # Custom yes/no spellings.
        (
            "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
            {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
            DataFrame(
                [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
                columns=["A", "B"],
            ),
        ),
        # Default recognition of upper-case "TRUE"/"FALSE".
        (
            "A,B\nTRUE,1\nFALSE,2\nTRUE,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        # Arbitrary tokens mapped to booleans in both columns.
        (
            "A,B\nfoo,bar\nbar,foo",
            {"true_values": ["foo"], "false_values": ["bar"]},
            DataFrame([[True, False], [False, True]], columns=["A", "B"]),
        ),
    ],
)
def test_parse_bool(all_parsers, data, kwargs, expected):
    """Boolean-like tokens are parsed to bools, optionally via true/false value lists."""
    parser = all_parsers
    parsed = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
|
||||
def test_parse_integers_above_fp_precision(all_parsers):
    """17-digit integers differing only in the last digit must be parsed exactly."""
    parser = all_parsers
    data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""

    # Four values ending in ...191, five in ...192 and one in ...194.
    numbers = (
        [17007000002000191] * 4 + [17007000002000192] * 5 + [17007000002000194]
    )
    expected = DataFrame({"Numbers": numbers})

    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
    """Values like "65248E10" are expected to parse as floats (gh-2601)."""
    parser = all_parsers
    data = "65248E10 11\n55555E55 22\n"
    read_kwargs = {"header": None, "sep": sep}

    if parser.engine == "pyarrow" and sep != " ":
        # Regex separators are rejected by this engine; only the error is checked.
        with pytest.raises(
            ValueError,
            match="the 'pyarrow' engine does not support regex separators",
        ):
            parser.read_csv(StringIO(data), **read_kwargs)
        return

    expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
    result = parser.read_csv(StringIO(data), **read_kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_int64_min_issues(all_parsers):
    """A missing trailing field in an otherwise integer column is read as NaN (gh-2599)."""
    parser = all_parsers
    csv_text = "A,B\n0,0\n0,"

    expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
    result = parser.read_csv(StringIO(csv_text))

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv, request):
    """Integers larger than UINT64_MAX stay strings, or overflow in a converter."""
    parser = all_parsers
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""

    if conv is not None:
        # 13007854817840016671868 > UINT64_MAX, so attempts
        # to cast to either int64 or uint64 will result in
        # an OverflowError being raised.
        if parser.engine == "pyarrow":
            err = ValueError
            msg = "The 'converters' option is not supported with the 'pyarrow' engine"
        else:
            err = OverflowError
            msg = (
                "Python int too large to convert to C long"
                "|long too big to convert"
                "|int too big to convert"
            )

        with pytest.raises(err, match=msg):
            parser.read_csv(StringIO(data), converters={"ID": conv})
        return

    # 13007854817840016671868 > UINT64_MAX, so this
    # will overflow and return object as the dtype.
    if parser.engine == "pyarrow":
        request.applymarker(pytest.mark.xfail(reason="parses to float64"))

    result = parser.read_csv(StringIO(data))

    # The expected values are the raw ID strings, i.e. every line after the header.
    id_strings = data.split("\n")[1:]
    expected = DataFrame(id_strings, columns=["ID"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    """Boundary values of int64/uint64 must be read back as the same integer."""
    # These numbers are the extreme values of the int64/uint64 ranges
    # (still inside them), so they should be parsed as integers rather
    # than strings: the expected frame holds ``val`` itself.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([val])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
    """Values one step outside the int64/uint64 range are kept as strings."""
    parser = all_parsers
    text = str(val)

    # Just outside the representable range, so the raw text is what is expected.
    expected = DataFrame([text])
    result = parser.read_csv(StringIO(text), header=None)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@xfail_pyarrow  # gets float64 dtype instead of object
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
    """A column mixing -1 and 2**63 is expected to come back as strings.

    No numerical dtype can hold both negative and uint64 values.
    """
    parser = all_parsers
    csv_text = "\n".join(exp_data)

    result = parser.read_csv(StringIO(csv_text), header=None)
    expected = DataFrame(exp_data)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_integer_precision(all_parsers):
    """19-digit integers in the last column must be read exactly (GH 7072)."""
    parser = all_parsers
    s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""

    frame = parser.read_csv(StringIO(s), header=None)
    # Only the fifth column (label 4) carries the large integers under test.
    last_column = frame[4]

    expected = Series([4321583677327450765, 4321113141090630389], name=4)
    tm.assert_series_equal(last_column, expected)
|
||||
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
def test_iterator(all_parsers):
    """``iterator=True`` allows reading the data piecewise via ``read`` (gh-6607)."""
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    kwargs = {"index_col": 0}

    # Reference result: the whole data read in one go.
    expected = parser.read_csv(StringIO(data), **kwargs)

    if parser.engine == "pyarrow":
        # Iteration is rejected by this engine; only the error is checked.
        with pytest.raises(
            ValueError,
            match="The 'iterator' option is not supported with the 'pyarrow' engine",
        ):
            parser.read_csv(StringIO(data), iterator=True, **kwargs)
        return

    with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
        head = reader.read(3)
        tm.assert_frame_equal(head, expected[:3])

        # Asking for more rows than remain returns just the remainder.
        tail = reader.read(5)
        tm.assert_frame_equal(tail, expected[3:])
|
||||
|
||||
|
||||
def test_iterator2(all_parsers):
    """Iterating a reader opened with ``iterator=True`` yields the parsed frame first."""
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""

    if parser.engine == "pyarrow":
        # Iteration is rejected by this engine; only the error is checked.
        with pytest.raises(
            ValueError,
            match="The 'iterator' option is not supported with the 'pyarrow' engine",
        ):
            parser.read_csv(StringIO(data), iterator=True)
        return

    with parser.read_csv(StringIO(data), iterator=True) as reader:
        chunks = list(reader)

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    first_chunk = chunks[0]
    tm.assert_frame_equal(first_chunk, expected)
|
||||
|
||||
|
||||
def test_iterator_stop_on_chunksize(all_parsers):
    """Iteration with ``chunksize=1`` stops after one chunk per data row (gh-3967)."""
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""

    if parser.engine == "pyarrow":
        # Chunked reading is rejected by this engine; only the error is checked.
        with pytest.raises(
            ValueError,
            match="The 'chunksize' option is not supported with the 'pyarrow' engine",
        ):
            parser.read_csv(StringIO(data), chunksize=1)
        return

    with parser.read_csv(StringIO(data), chunksize=1) as reader:
        chunks = list(reader)

    # Three data rows, one row per chunk.
    assert len(chunks) == 3

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(concat(chunks), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}]
)
def test_iterator_skipfooter_errors(all_parsers, kwargs):
    """``skipfooter`` combined with iteration or chunking raises ``ValueError``."""
    parser = all_parsers
    data = "a\n1\n2"

    if parser.engine == "pyarrow":
        # For this engine the iteration option itself is what gets rejected.
        msg = (
            "The '(chunksize|iterator)' option is not supported with the "
            "'pyarrow' engine"
        )
    else:
        msg = "'skipfooter' not supported for iteration"

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
            pass
|
||||
|
||||
|
||||
def test_iteration_open_handle(all_parsers):
    """Parsing from a partially consumed file handle continues after the lines read."""
    parser = all_parsers
    kwargs = {"header": None}
    content = "AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG"

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as writer:
            writer.write(content)

        with open(path, encoding="utf-8") as f:
            # Consume the handle up to and including the "CCC" line.
            for line in f:
                if "CCC" in line:
                    break

            result = parser.read_csv(f, **kwargs)
            remaining = ["DDD", "EEE", "FFF", "GGG"]
            expected = DataFrame({0: remaining})
            tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,320 @@
|
||||
"""
|
||||
Tests that work on the Python, C and PyArrow engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
import codecs
|
||||
import csv
|
||||
from io import StringIO
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PY311
|
||||
from pandas.errors import (
|
||||
EmptyDataError,
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
|
||||
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
|
||||
|
||||
|
||||
def test_empty_decimal_marker(all_parsers):
    """An empty ``decimal`` marker is rejected with ``ValueError``."""
    parser = all_parsers
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    if parser.engine == "pyarrow":
        msg = (
            "only single character unicode strings can be "
            "converted to Py_UCS4, got length 0"
        )
    else:
        # Parsers support only length-1 decimals
        msg = "Only length-1 decimal markers supported"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), decimal="")
|
||||
|
||||
|
||||
def test_bad_stream_exception(all_parsers, csv_dir_path):
    """A stream that fails to decode must surface ``UnicodeDecodeError`` (gh-13652).

    This test validates that both the Python engine and C engine will
    raise UnicodeDecodeError instead of C engine raising ParserError
    and swallowing the exception that caused read to fail.
    """
    parser = all_parsers
    path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")

    # The same UTF-8 codec supplies the encode/decode functions and the
    # stream reader/writer classes of the recoder.
    utf8 = codecs.lookup("utf-8")
    expected_msg = "'utf-8' codec can't decode byte"

    # Stream must be binary UTF8.
    with open(path, "rb") as handle, codecs.StreamRecoder(
        handle, utf8.encode, utf8.decode, utf8.streamreader, utf8.streamwriter
    ) as stream:
        with pytest.raises(UnicodeDecodeError, match=expected_msg):
            parser.read_csv(stream)
|
||||
|
||||
|
||||
def test_malformed(all_parsers):
    """A row with too many fields raises ``ParserError`` naming the line (gh-6607)."""
    parser = all_parsers
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
    if parser.engine == "pyarrow":
        # The 'comment' option itself is rejected by this engine.
        err = ValueError
        msg = "The 'comment' option is not supported with the 'pyarrow' engine"
    else:
        err = ParserError
        msg = "Expected 3 fields in line 4, saw 5"

    with pytest.raises(err, match=msg):
        parser.read_csv(StringIO(data), header=1, comment="#")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
    """Reading ``nrows`` from an iterator over malformed data raises ``ParserError``."""
    parser = all_parsers
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    read_kwargs = {
        "header": 1,
        "comment": "#",
        "iterator": True,
        "chunksize": 1,
        "skiprows": [2],
    }

    if parser.engine == "pyarrow":
        # Iteration is rejected by this engine; only the error is checked.
        with pytest.raises(
            ValueError,
            match="The 'iterator' option is not supported with the 'pyarrow' engine",
        ):
            parser.read_csv(StringIO(data), **read_kwargs)
        return

    with parser.read_csv(StringIO(data), **read_kwargs) as reader:
        with pytest.raises(ParserError, match="Expected 3 fields in line 6, saw 5"):
            reader.read(nrows)
|
||||
|
||||
|
||||
@xfail_pyarrow  # does not raise
def test_catch_too_many_names(all_parsers):
    """Passing four names for three-column data raises ``ValueError`` (gh-5156)."""
    parser = all_parsers
    data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""

    # The wording of the error depends on the engine.
    if parser.engine == "c":
        msg = "Too many columns specified: expected 4 and found 3"
    else:
        msg = (
            "Number of passed names did not match "
            "number of header fields in the file"
        )

    too_many_names = ["a", "b", "c", "d"]
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=0, names=too_many_names)
|
||||
|
||||
|
||||
@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
    """Input consisting only of ``nrows`` newlines raises ``EmptyDataError``."""
    parser = all_parsers
    only_newlines = "\n" * nrows

    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        parser.read_csv(StringIO(only_newlines))
|
||||
|
||||
|
||||
def test_unexpected_keyword_parameter_exception(all_parsers):
    """An unknown keyword raises ``TypeError`` naming the called function (GH-34976)."""
    parser = all_parsers
    template = "{}\\(\\) got an unexpected keyword argument 'foo'"

    # (function name, file argument) pairs, checked in this order.
    cases = (("read_csv", "foo.csv"), ("read_table", "foo.tsv"))
    for func_name, filename in cases:
        reader_func = getattr(parser, func_name)
        with pytest.raises(TypeError, match=template.format(func_name)):
            reader_func(filename, foo=1)
|
||||
|
||||
|
||||
def test_suppress_error_output(all_parsers):
    """``on_bad_lines="skip"`` drops the over-long rows and keeps the rest."""
    # see gh-15925
    # Rows "1,2,3" and "5,6,7" have three fields under a one-column header.
    buffer = StringIO("a\n1\n1,2,3\n4\n5,6,7")

    result = all_parsers.read_csv(buffer, on_bad_lines="skip")

    tm.assert_frame_equal(result, DataFrame({"a": [1, 4]}))
|
||||
|
||||
|
||||
def test_error_bad_lines(all_parsers):
    """``on_bad_lines="error"`` raises ParserError on the first over-long row."""
    # see gh-15925
    parser = all_parsers

    if parser.engine == "pyarrow":
        # "CSV parse error: Expected 1 columns, got 3: 1,2,3"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    data = "a\n1\n1,2,3\n4\n5,6,7"
    expected_error = "Expected 1 fields in line 3, saw 3"

    with pytest.raises(ParserError, match=expected_error):
        parser.read_csv(StringIO(data), on_bad_lines="error")
|
||||
|
||||
|
||||
def test_warn_bad_lines(all_parsers):
    """``on_bad_lines="warn"`` warns about bad rows and still returns good ones."""
    # see gh-15925
    parser = all_parsers

    # The pyarrow engine uses its own wording and may also emit a
    # DeprecationWarning, so both warning classes are accepted there.
    if parser.engine == "pyarrow":
        warning_types = (ParserWarning, DeprecationWarning)
        pattern = "Expected 1 columns, but found 3: 1,2,3"
    else:
        warning_types = ParserWarning
        pattern = "Skipping line"

    buffer = StringIO("a\n1\n1,2,3\n4\n5,6,7")
    with tm.assert_produces_warning(
        warning_types, match=pattern, check_stacklevel=False
    ):
        result = parser.read_csv(buffer, on_bad_lines="warn")

    tm.assert_frame_equal(result, DataFrame({"a": [1, 4]}))
|
||||
|
||||
|
||||
def test_read_csv_wrong_num_columns(all_parsers):
    """A row with more fields than the header raises ParserError."""
    parser = all_parsers

    if parser.engine == "pyarrow":
        # Expected 6 columns, got 7: 6,7,8,9,10,11,12
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    # Line 3 carries seven fields under a six-column header.
    data = (
        "A,B,C,D,E,F\n"
        "1,2,3,4,5,6\n"
        "6,7,8,9,10,11,12\n"
        "11,12,13,14,15,16\n"
    )

    with pytest.raises(ParserError, match="Expected 6 fields in line 3, saw 7"):
        parser.read_csv(StringIO(data))
|
||||
|
||||
|
||||
def test_null_byte_char(request, all_parsers):
    """Check per-engine handling of a NUL byte as the first field."""
    # see gh-2741
    parser = all_parsers
    engine = parser.engine
    names = ["a", "b"]
    data = "\x00,foo"

    if engine == "pyarrow":
        # CSV parse error: Empty CSV file or block: "
        # cannot infer number of columns"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    python_on_311 = engine == "python" and PY311
    if engine == "c" or python_on_311:
        if python_on_311:
            # Expected to fail: see the reason string below.
            xfail_mark = pytest.mark.xfail(
                reason="In Python 3.11, this is read as an empty character not null"
            )
            request.applymarker(xfail_mark)
        result = parser.read_csv(StringIO(data), names=names)
        tm.assert_frame_equal(result, DataFrame([[np.nan, "foo"]], columns=names))
        return

    # Remaining case: the parser is expected to reject the NUL byte outright.
    with pytest.raises(ParserError, match="NULL byte detected"):
        parser.read_csv(StringIO(data), names=names)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("always::ResourceWarning")
def test_open_file(request, all_parsers):
    """``sep=None`` on an unsniffable file errors without emitting a warning."""
    # GH 39024
    parser = all_parsers
    engine = parser.engine

    # The c and pyarrow engines reject sep=None up front with a ValueError;
    # otherwise the failure is a csv.Error from delimiter detection.
    if engine in ("c", "pyarrow"):
        err = ValueError
        msg = (
            f"the '{engine}' engine does not support sep=None "
            "with delim_whitespace=False"
        )
    else:
        err = csv.Error
        msg = "Could not determine delimiter"

    with tm.ensure_clean() as path:
        target = Path(path)
        target.write_bytes(b"\xe4\na\n1")

        # should not trigger a ResourceWarning
        with tm.assert_produces_warning(None), pytest.raises(err, match=msg):
            parser.read_csv(target, sep=None, encoding_errors="replace")
|
||||
|
||||
|
||||
def test_invalid_on_bad_line(all_parsers):
    """An unrecognised ``on_bad_lines`` value raises ValueError."""
    buffer = StringIO("a\n1\n1,2,3\n4\n5,6,7")
    expected_error = "Argument abc is invalid for on_bad_lines"

    with pytest.raises(ValueError, match=expected_error):
        all_parsers.read_csv(buffer, on_bad_lines="abc")
|
||||
|
||||
|
||||
def test_bad_header_uniform_error(all_parsers):
    """A one-field first line followed by four-field rows raises ParserError."""
    parser = all_parsers
    engine = parser.engine

    if engine == "pyarrow":
        # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

    # The C engine reports the problem while building the index; the other
    # engine reports a field-count mismatch on line 2.
    if engine == "c":
        msg = (
            "Could not construct index. Requested to use 1 "
            "number of columns, but 3 left to parse."
        )
    else:
        msg = "Expected 2 fields in line 2, saw 4"

    data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
|
||||
|
||||
|
||||
def test_on_bad_lines_warn_correct_formatting(all_parsers):
    """``on_bad_lines="warn"`` warns on three-field rows and keeps the others."""
    # see gh-15925
    parser = all_parsers

    # pyarrow words the warning differently and may also emit a
    # DeprecationWarning, so both warning classes are accepted there.
    if parser.engine == "pyarrow":
        warning_types = (ParserWarning, DeprecationWarning)
        pattern = "Expected 2 columns, but found 3: a,b,c"
    else:
        warning_types = ParserWarning
        pattern = "Skipping line"

    # Two-column header; the "a,b,c" and "a,b,d" rows have one field too many.
    data = "1,2\na,b\na,b,c\na,b,d\na,b\n"
    with tm.assert_produces_warning(
        warning_types, match=pattern, check_stacklevel=False
    ):
        result = parser.read_csv(StringIO(data), on_bad_lines="warn")

    expected = DataFrame({"1": "a", "2": ["b", "b"]})
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,81 @@
|
||||
"""
|
||||
Tests that work on both the Python and C engines but do not have a
|
||||
specific classification into the other test modules.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
# Pattern the tests in this module pass as ``match=`` when asserting the
# FutureWarning raised for read_csv(..., verbose=True).
depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated"
|
||||
|
||||
|
||||
def test_verbose_read(all_parsers, capsys):
    """``verbose=True`` warns as deprecated and prints engine-specific output."""
    parser = all_parsers
    engine = parser.engine
    # Column "a" is empty in three of the eight data rows.
    rows = [
        "a,b,c,d",
        "one,1,2,3",
        "one,1,2,3",
        ",1,2,3",
        "one,1,2,3",
        ",1,2,3",
        ",1,2,3",
        "one,1,2,3",
        "two,1,2,3",
    ]
    data = "\n".join(rows)

    if engine == "pyarrow":
        unsupported = "The 'verbose' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=unsupported), tm.assert_produces_warning(
            FutureWarning, match=depr_msg, check_stacklevel=False
        ):
            parser.read_csv(StringIO(data), verbose=True)
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        parser.read_csv(StringIO(data), verbose=True)
    stdout = capsys.readouterr().out

    # Engines are verbose in different ways.
    if engine == "c":
        assert "Tokenization took:" in stdout
        assert "Parser memory cleanup took:" in stdout
    else:  # Python engine
        assert stdout == "Filled 3 NA values in column a\n"
|
||||
|
||||
|
||||
def test_verbose_read2(all_parsers, capsys):
    """Same as the plain verbose check, but reading with ``index_col=0``."""
    parser = all_parsers
    engine = parser.engine
    # Column "a" is empty in exactly one data row.
    rows = [
        "a,b,c,d",
        "one,1,2,3",
        "two,1,2,3",
        "three,1,2,3",
        "four,1,2,3",
        "five,1,2,3",
        ",1,2,3",
        "seven,1,2,3",
        "eight,1,2,3",
    ]
    data = "\n".join(rows)

    if engine == "pyarrow":
        unsupported = "The 'verbose' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=unsupported), tm.assert_produces_warning(
            FutureWarning, match=depr_msg, check_stacklevel=False
        ):
            parser.read_csv(StringIO(data), verbose=True, index_col=0)
        return

    with tm.assert_produces_warning(
        FutureWarning, match=depr_msg, check_stacklevel=False
    ):
        parser.read_csv(StringIO(data), verbose=True, index_col=0)
    stdout = capsys.readouterr().out

    # Engines are verbose in different ways.
    if engine == "c":
        assert "Tokenization took:" in stdout
        assert "Parser memory cleanup took:" in stdout
    else:  # Python engine
        assert stdout == "Filled 1 NA values in column a\n"
|
||||
Reference in New Issue
Block a user