Skip to content

Commit

Permalink
C Parser: use io.DEFAULT_BUFFER_SIZE for read buffer
Browse files Browse the repository at this point in the history
  • Loading branch information
MKuranowski committed Feb 20, 2024
1 parent 17e29cb commit b008b1e
Showing 1 changed file with 32 additions and 22 deletions.
54 changes: 32 additions & 22 deletions aiocsv/_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,6 @@ int Dialect_init(Dialect* d, PyObject* o) {
return 1;
}

/// Number of Py_UCS4 elements in the parser's fixed-size read buffer.
/// The same value is used as the size argument of the reader's `read` call.
#define PARSER_BUFFER_CAPACITY 4096
// Compile-time guard: the buffer length and index fields are unsigned short,
// so the capacity must be representable in that type (at most USHRT_MAX).
static_assert(
PARSER_BUFFER_CAPACITY <= USHRT_MAX,
"Parser buffer is indexed by unsigned short - capacity must be smaller than USHRT_MAX");

typedef struct {
// clang-format off
PyObject_HEAD
Expand All @@ -234,8 +229,15 @@ typedef struct {
/// Generator[Any, None, str] if waiting for a read, NULL otherwise.
PyObject* current_read;

/// Data returned by the latest read
Py_UCS4 buffer[PARSER_BUFFER_CAPACITY];
/// Data returned by the latest read. If not NULL, must have room for at least
/// `io_default_buffer_size` (taken from the module state) Py_UCS4 characters, not bytes.
Py_UCS4* buffer;

/// Number of valid characters in buffer
Py_ssize_t buffer_len;

/// Offset into buffer to the first valid character
Py_ssize_t buffer_idx;

/// list[str] with parsed fields from the current record. Lazily allocated, may be NULL.
PyObject* record_so_far;
Expand All @@ -259,12 +261,6 @@ typedef struct {
/// a one-based line number of the last-encountered line.
unsigned int line_num;

/// Number of valid characters in buffer
unsigned short buffer_len;

/// Offset into buffer to the first valid character
unsigned short buffer_idx;

/// ParserState for the parser state machine.
unsigned char state;

Expand All @@ -284,6 +280,10 @@ static void Parser_dealloc(Parser* self) {
PyTypeObject* tp = Py_TYPE(self);
PyObject_GC_UnTrack(self);
tp->tp_clear((PyObject*)self);
if (self->buffer) {
PyMem_Free(self->buffer);
self->buffer = NULL;
}
if (self->field_so_far) {
PyMem_Free(self->field_so_far);
self->field_so_far = NULL;
Expand Down Expand Up @@ -349,12 +349,13 @@ static PyObject* Parser_new(PyObject* module, PyObject* args, PyObject* kwargs)

self->current_read = NULL;
self->record_so_far = NULL;
self->buffer = NULL;
self->buffer_len = 0;
self->buffer_idx = 0;
self->field_so_far = NULL;
self->field_so_far_capacity = 0;
self->field_so_far_len = 0;
self->line_num = 0;
self->buffer_len = 0;
self->buffer_idx = 0;
self->state = STATE_START_RECORD;
self->field_was_numeric = false;
self->last_char_was_cr = false;
Expand Down Expand Up @@ -643,7 +644,7 @@ static int Parser_initiate_read(Parser* self) {
name = PyUnicode_FromStringAndSize("read", 4);
if (!name) FINISH_WITH(0);

len = PyLong_FromLong(PARSER_BUFFER_CAPACITY);
len = PyLong_FromLong(module_get_state(self->module)->io_default_buffer_size);
if (!len) FINISH_WITH(0);

read_coro = PyObject_CallMethodOneArg(self->reader, name, len);
Expand Down Expand Up @@ -732,22 +733,31 @@ static int Parser_copy_to_buffer(Parser* self, PyObject* unicode) {
FINISH_WITH(0);
}

Py_ssize_t cap = module_get_state(self->module)->io_default_buffer_size;
Py_ssize_t len = PyUnicode_GetLength(unicode);
if (len < 0) {
FINISH_WITH(0);
} else if (len == 0) {
self->buffer_len = 0;
self->buffer_idx = 0;
self->eof = true;
} else if (len <= PARSER_BUFFER_CAPACITY) {
if (!PyUnicode_AsUCS4(unicode, self->buffer, PARSER_BUFFER_CAPACITY, false))
FINISH_WITH(0);
self->buffer_len = (unsigned short)len;
} else if (len <= cap) {
// Allocate the buffer if it was not allocated beforehand
if (!self->buffer) {
PyMem_Resize(self->buffer, Py_UCS4, cap);
if (!self->buffer) {
PyErr_NoMemory();
FINISH_WITH(0);
}
}

if (!PyUnicode_AsUCS4(unicode, self->buffer, cap, false)) FINISH_WITH(0);
self->buffer_len = len;
self->buffer_idx = 0;
} else {
PyErr_Format(PyExc_ValueError,
"reader has read %zi bytes, which is more than the requested %i bytes", len,
PARSER_BUFFER_CAPACITY);
"reader has read %zi bytes, which is more than the requested %zi bytes", len,
cap);
FINISH_WITH(0);
}

Expand Down

0 comments on commit b008b1e

Please sign in to comment.