root-project · jblomer · Mar 22, 2024 · Mar 23, 2024 · Mar 23, 2024 · Mar 24, 2024
@@ -294,6 +294,25 @@ def RDF(self):
         try:
             from ._pythonization._rdataframe import _MakeNumpyDataFrame
 
+            # Provide a FromCSV factory method that uses keyword arguments instead of the ROptions config struct.
+            # In Python, the RCsvDS::ROptions struct members are available without the leading 'f' and in camelCase,
+            # e.g. fDelimiter --> delimiter.
+            # We need to keep the parameters of the old FromCSV signature for backward compatibility.
+            ns._FromCSV = ns.FromCSV
+            def MakeCSVDataFrame(
+                    fileName, readHeaders = True, delimiter = ',', linesChunkSize = -1, colTypes = {}, **kwargs):
+                options = ns.RCsvDS.ROptions()
+                options.fHeaders = readHeaders
+                options.fDelimiter = delimiter
+                options.fLinesChunkSize = linesChunkSize
+                options.fColumnTypes = colTypes
+                for key, val in kwargs.items():
+                    structMemberName = 'f' + key[0].upper() + key[1:]
+                    if hasattr(options, structMemberName):
+                        setattr(options, structMemberName, val)
+                return ns._FromCSV(fileName, options)
+            ns.FromCSV = MakeCSVDataFrame
+
             # Make a copy of the arrays that have strides to make sure we read the correct values
             # TODO a cleaner fix
             def MakeNumpyDataFrameCopy(np_dict):

@@ -33,6 +33,31 @@ class RRawFile;
 namespace RDF {
 
 class RCsvDS final : public ROOT::RDF::RDataSource {
+public:
+   /// Options that control how the CSV file is parsed.
+   /// Every member carries a default; a default-constructed ROptions expects a header line
+   /// and ',' as the column delimiter.
+   struct ROptions {
+      /// The first line describes the columns. The names are used as RDF column names
+      /// unless fColumnNames is not empty, in which case it replaces the given names.
+      /// If fHeaders is false and fColumnNames is empty, generic, numbered column names
+      /// (the prefix "Col" followed by an index) are used.
+      bool fHeaders = true;
+      char fDelimiter = ',';             ///< Column delimiter character
+      bool fLeftTrim = false;            ///< Leading whitespaces are removed
+      bool fRightTrim = false;           ///< Trailing whitespaces are removed
+      bool fSkipBlankLines = true;       ///< Ignore empty lines (after trimming, if trimming is enabled)
+      std::int64_t fSkipFirstNLines = 0; ///< Ignore the first N lines of the file
+      std::int64_t fSkipLastNLines = 0;  ///< Ignore the last N lines of the file
+      std::int64_t fLinesChunkSize = -1; ///< Number of lines to read, -1 to read all
+      /// Character indicating that the remainder of the line should be ignored, if different from '\0'.
+      /// If it is the first character of the line (after trimming), the line is ignored altogether.
+      /// Note that the comment character must not be part of the data, e.g. in strings.
+      char fComment = '\0';
+      /// Impose column names. This can be used if a header is missing or if the header has unparsable or
+      /// unwanted column names.
+      std::vector<std::string> fColumnNames;
+      /// Specify custom column types, accepts an unordered map with keys being column name, values being type alias
+      /// ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
+      std::unordered_map<std::string, char> fColumnTypes;
+   };
 
 private:
    // Possible values are D, O, L, T. This is possible only because we treat double, bool, Long64_t and string
@@ -42,12 +67,13 @@ private:
    // Regular expressions for type inference
    static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
 
+   ROptions fOptions;
    std::uint64_t fDataPos = 0;
-   bool fReadHeaders = false;
+   std::int64_t fDataLineNumber = 0;
+   std::int64_t fLineNumber = 0;     // used to skip the last lines
+   std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
    unsigned int fNSlots = 0U;
    std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
-   const char fDelimiter;
-   const Long64_t fLinesChunkSize;
    ULong64_t fEntryRangesRequested = 0ULL;
    ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
    std::vector<std::string> fHeaders; // the column names
@@ -63,6 +89,10 @@ private:
    // work given that the pointer to the boolean in that case cannot be taken
    std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
 
+   void Construct();
+
+   bool Readln(std::string &line);
+   void RewindToData();
    void FillHeaders(const std::string &);
    void FillRecord(const std::string &, Record_t &);
    void GenerateHeaders(size_t);
@@ -79,6 +109,7 @@ protected:
    std::string AsString() final;
 
 public:
+   /// \brief Create a CSV data source configured through an ROptions struct.
+   /// \param[in] fileName Path of the CSV file.
+   /// \param[in] options File parsing settings.
+   RCsvDS(std::string_view fileName, const ROptions &options);
    RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
           std::unordered_map<std::string, char> &&colTypes = {});
    void Finalize() final;
@@ -92,6 +123,12 @@ public:
    std::string GetLabel() final;
 };
 
+////////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Factory method to create a CSV RDataFrame.
+/// \param[in] fileName Path of the CSV file.
+/// \param[in] options File parsing settings.
+RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
+
 ////////////////////////////////////////////////////////////////////////////////////////////////
 /// \brief Factory method to create a CSV RDataFrame.
 /// \param[in] fileName Path of the CSV file.