Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DF] Extend parsing capabilities of CSV data source #15045

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions bindings/pyroot/pythonizations/python/ROOT/_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,25 @@ def RDF(self):
try:
from ._pythonization._rdataframe import _MakeNumpyDataFrame

# Provide a FromCSV factory method that uses keyword arguments instead of the ROptions config struct.
# In Python, the RCsvDS::ROptions struct members are available without the leading 'f' and in camelCase,
# e.g. fDelimiter --> delimiter.
# We need to keep the parameters of the old FromCSV signature for backward compatibility.
ns._FromCSV = ns.FromCSV
def MakeCSVDataFrame(
        fileName, readHeaders = True, delimiter = ',', linesChunkSize = -1, colTypes = {}, **kwargs):
    """
    Create an RDataFrame from a CSV file, configuring the parser through keyword arguments.

    The first five parameters mirror the old FromCSV signature and are kept for backward
    compatibility. Any further option is passed as a keyword argument named after the
    corresponding RCsvDS::ROptions member, without the leading 'f' and in camelCase
    (e.g. fDelimiter --> delimiter).

    Parameters:
        fileName: Path of the CSV file, forwarded unchanged to the wrapped FromCSV.
        readHeaders: Value stored in ROptions.fHeaders.
        delimiter: Value stored in ROptions.fDelimiter.
        linesChunkSize: Value stored in ROptions.fLinesChunkSize.
        colTypes: Value stored in ROptions.fColumnTypes. The default dict is only read,
            never modified, by this function.
        **kwargs: Additional ROptions members to set.

    Returns:
        Whatever the wrapped C++ FromCSV(fileName, options) overload returns.

    Raises:
        TypeError: If a keyword argument does not correspond to any ROptions member.
    """
    options = ns.RCsvDS.ROptions()
    options.fHeaders = readHeaders
    options.fDelimiter = delimiter
    options.fLinesChunkSize = linesChunkSize
    options.fColumnTypes = colTypes
    for key, val in kwargs.items():
        # Translate e.g. 'leftTrim' into 'fLeftTrim'. Slicing (instead of key[0]) keeps an
        # empty key from raising an unrelated IndexError.
        structMemberName = 'f' + key[:1].upper() + key[1:]
        if not hasattr(options, structMemberName):
            # A misspelled option must not be dropped silently: the file would be parsed
            # with default settings without any hint that the argument had no effect.
            raise TypeError("FromCSV() got an unexpected keyword argument '{}'".format(key))
        setattr(options, structMemberName, val)
    return ns._FromCSV(fileName, options)
ns.FromCSV = MakeCSVDataFrame

# Make a copy of the arrays that have strides to make sure we read the correct values
# TODO a cleaner fix
def MakeNumpyDataFrameCopy(np_dict):
Expand Down
43 changes: 40 additions & 3 deletions tree/dataframe/inc/ROOT/RCsvDS.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,31 @@ class RRawFile;
namespace RDF {

class RCsvDS final : public ROOT::RDF::RDataSource {
public:
/// Settings that steer the parsing of the CSV file
struct ROptions {
   /// If true, the first line of the file describes the columns and its entries become the RDF
   /// column names, unless fColumnNames is non-empty, in which case fColumnNames replaces them.
   /// If fHeaders is false and fColumnNames is empty, generic column names are used
   /// (pattern given upstream as Col1.n.Col$n$ -- NOTE(review): confirm the exact numbering).
   bool fHeaders{true};
   /// Character separating the columns
   char fDelimiter{','};
   /// Strip leading whitespace
   bool fLeftTrim{false};
   /// Strip trailing whitespace
   bool fRightTrim{false};
   /// Skip empty lines; if trimming is enabled, emptiness is judged after trimming
   bool fSkipBlankLines{true};
   /// Number of lines at the beginning of the file that are ignored
   std::int64_t fSkipFirstNLines{0};
   /// Number of lines at the end of the file that are ignored
   std::int64_t fSkipLastNLines{0};
   /// Number of lines to read; -1 means read all
   std::int64_t fLinesChunkSize{-1};
   /// Comment character: unless it is '\0', the rest of a line following this character is ignored.
   /// A line whose first character (after trimming) is the comment character is dropped entirely.
   /// The comment character must not occur inside the data itself, e.g. within strings.
   char fComment{'\0'};
   /// Column names to impose. Useful when the file has no header, or when the header contains
   /// unparsable or unwanted column names.
   std::vector<std::string> fColumnNames;
   /// Custom column types as an unordered map from column name to type alias
   /// ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
   std::unordered_map<std::string, char> fColumnTypes;
};

private:
// Possible values are D, O, L, T. This is possible only because we treat double, bool, Long64_t and string
Expand All @@ -42,12 +67,13 @@ private:
// Regular expressions for type inference
static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;

ROptions fOptions;
std::uint64_t fDataPos = 0;
bool fReadHeaders = false;
std::int64_t fDataLineNumber = 0;
std::int64_t fLineNumber = 0; // used to skip the last lines
std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
unsigned int fNSlots = 0U;
std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
const char fDelimiter;
const Long64_t fLinesChunkSize;
ULong64_t fEntryRangesRequested = 0ULL;
ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
std::vector<std::string> fHeaders; // the column names
Expand All @@ -63,6 +89,10 @@ private:
// work given that the pointer to the boolean in that case cannot be taken
std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot

void Construct();

bool Readln(std::string &line);
void RewindToData();
void FillHeaders(const std::string &);
void FillRecord(const std::string &, Record_t &);
void GenerateHeaders(size_t);
Expand All @@ -79,6 +109,7 @@ protected:
std::string AsString() final;

public:
/// Construct the data source for the CSV file \p fileName, with parsing controlled by \p options.
RCsvDS(std::string_view fileName, const ROptions &options);
/// Constructor taking individual settings instead of an ROptions struct.
/// NOTE(review): readHeaders, delimiter, linesChunkSize and colTypes presumably correspond to
/// ROptions::fHeaders, fDelimiter, fLinesChunkSize and fColumnTypes -- confirm against the implementation.
RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
std::unordered_map<std::string, char> &&colTypes = {});
void Finalize() final;
Expand All @@ -92,6 +123,12 @@ public:
std::string GetLabel() final;
};

////////////////////////////////////////////////////////////////////////////////////////////////
/// \brief Factory method to create a CSV RDataFrame.
/// \param[in] fileName Path of the CSV file.
/// \param[in] options File parsing settings (see RCsvDS::ROptions for the available fields and their defaults).
/// \return The newly created RDataFrame.
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);

////////////////////////////////////////////////////////////////////////////////////////////////
/// \brief Factory method to create a CSV RDataFrame.
/// \param[in] fileName Path of the CSV file.
Expand Down
Loading
Loading