Skip to content

Commit

Permalink
Merge pull request #19 from TidierOrg/add-list_files
Browse files Browse the repository at this point in the history
add `list_files`
  • Loading branch information
drizk1 authored Sep 9, 2024
2 parents a57a5b4 + a675015 commit 7075aee
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 72 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TidierFiles"
uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db"
authors = ["Daniel Rizk <[email protected]> and contributors"]
version = "0.1.4"
version = "0.1.5"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Currently supported file types:
Agnostic read and write functions that detect the type and dispatch the appropriate function.
- `read_file` and `write_file`

- `list_files` to list files in a directory, optionally filtered by a pattern.

# Examples

Here is an example of how to write and read a CSV file.
Expand Down
2 changes: 2 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Currently supported file types:
Agnostic read and write functions that detect the type and dispatch the appropriate function.
- `read_file` and `write_file`

- `list_files` to list files in a directory, optionally filtered by a pattern.

# Examples

Here is an example of how to write and read a CSV file.
Expand Down
21 changes: 19 additions & 2 deletions src/TidierFiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ using RData

export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx,
read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata
write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata, list_files


include("docstrings.jl")
include("fwf.jl")
Expand Down Expand Up @@ -448,4 +448,21 @@ end

include("gen_fxn.jl")

"""
$docstring_list_files
"""
function list_files(path = "", pattern = "")
    # Recursively collect the full path of every file under `path`.
    found = String[]
    for (root, _dirs, filenames) in walkdir(path)
        append!(found, joinpath.(root, filenames))
    end
    # Keep only paths containing `pattern` ("" keeps everything).
    # NOTE: this is a substring match anywhere in the path, not a suffix match.
    matched = filter(p -> occursin(pattern, p), found)
    # The previous emptiness check ran on walkdir's per-directory tuples
    # (always non-empty when `path` exists), so it could never fire; raise
    # the error once we actually know nothing matched.
    isempty(matched) && error("No files ending with $pattern located at $path")
    return matched
end


end
26 changes: 20 additions & 6 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,12 @@ julia> write_xlsx(("REPORT_A" => df, "REPORT_B" => df2); path="xlsxtest.xlsx", o
julia> read_xlsx("xlsxtest.xlsx", sheet = "REPORT_A", skip = 1, n_max = 4, missingstring = [2])
3×3 DataFrame
Row │ integers strings floats
Any String Float64
─────┼─────────────────────────────────────────
1 │ missing Package makes 20.3
2 │ 3 File reading/writing 30.4
3 │ 4 even smoother 40.5
Row │ integers strings floats
Int64? String? Float64?
─────┼─────────────────────────────────────────
1 │ missing Package makes 20.3
2 │ 3 File reading/writing 30.4
3 │ 4 even smoother 40.5
```
"""

Expand Down Expand Up @@ -473,6 +473,7 @@ julia> write_sav(df, "test.por")
2 │ por 10.2
```
"""

const docstring_write_sas =
"""
write_sas(df, path)
Expand Down Expand Up @@ -674,4 +675,17 @@ Read `.rdata` and `.rds` files as DataFrame. `.rdata` files will result in a `Di
# Arguments
- `path`: A string with the file location. This does not yet support reading from URLs.
"""

# Docstring attached to `list_files` in TidierFiles.jl.
const docstring_list_files =
"""
    list_files(path = "", pattern = "")

List all files under `path` (searched recursively) whose path contains `pattern`.

# Arguments
- `path`: The directory to search. Defaults to an empty string.
- `pattern`: A substring used to filter the returned paths. Defaults to an
  empty string, which matches every file. Matching is by substring anywhere
  in the path, e.g. `".csv"` returns paths containing ".csv".

# Examples
- `list_files("/path/to/folder/", ".csv")`
"""
113 changes: 50 additions & 63 deletions src/xlfiles.jl
Original file line number Diff line number Diff line change
@@ -1,37 +1,43 @@
function infer_type(value)
if isa(value, Missing)
return Missing
elseif isa(value, Number)
if isa(value, Int) || isa(value, Bool)
return Int
else
return Float64
end
elseif isa(value, DateTime)
return DateTime
elseif isa(value, Time)
return Time
elseif isa(value, Date)
# Infer the element type of a column by inspecting up to its first five
# non-missing values. Checks run in order: already Int, already Float64,
# parseable as Int, parseable as Float64, parseable as a yyyy-mm-dd Date;
# anything else falls back to String.
function infer_column_type(values)
    present = filter(!ismissing, values)
    sample = present[1:min(5, length(present))]

    # Predicates: a value qualifies if it already has the type or its
    # string form parses into it.
    intlike(v) = isa(v, Int) || tryparse(Int, string(v)) !== nothing
    floatlike(v) = isa(v, Float64) || tryparse(Float64, string(v)) !== nothing
    datelike(v) = isa(v, Date) || tryparse(Date, string(v), dateformat"yyyy-mm-dd") !== nothing

    all(v -> isa(v, Int), sample) && return Int
    all(v -> isa(v, Float64), sample) && return Float64
    all(intlike, sample) && return Int
    all(floatlike, sample) && return Float64
    all(datelike, sample) && return Date
    return String
end

function convert_column(column)
non_missing_values = filter(!ismissing, column)
if isempty(non_missing_values)
return column # Return as-is if all values are missing
end

target_type = reduce((x, y) -> x === y ? x : String, map(infer_type, non_missing_values))
try
return target_type == Missing ? column : convert(Vector{target_type}, column)
catch
return column # Fallback to original if conversion fails
# Convert each element of `col` to `inferred_type`, preserving `missing`.
# Because type inference only samples the first few rows, later rows may
# fail to parse; such values become `missing` rather than leaking `nothing`
# (the previous behavior) into the column.
function convert_column(col, inferred_type)
    if inferred_type == Int
        return [x === missing ? missing :
                isa(x, Int) ? x :
                something(tryparse(Int, string(x)), missing) for x in col]
    elseif inferred_type == Float64
        return [x === missing ? missing :
                isa(x, Float64) ? x :
                something(tryparse(Float64, string(x)), missing) for x in col]
    elseif inferred_type == Date
        return [x === missing ? missing :
                isa(x, Date) ? x :
                something(tryparse(Date, string(x), dateformat"yyyy-mm-dd"), missing) for x in col]
    else
        # `string(x)` rather than `convert(String, x)`: the latter throws a
        # MethodError for non-String values in a mixed column.
        return [x === missing ? missing : string(x) for x in col]
    end
end


"""
$docstring_read_xlsx
"""
Expand All @@ -40,70 +46,56 @@ function read_xlsx(
sheet = nothing,
range = nothing,
col_names = true,
col_types = nothing,
missingstring = "",
trim_ws = true,
skip = 0,
n_max = Inf,
guess_max = nothing)


if startswith(path, "http://") || startswith(path, "https://")
# Fetch the content from the URL
n_max = Inf
)
# Fetch the Excel file (from URL or local path)
xf = if startswith(path, "http://") || startswith(path, "https://")
response = HTTP.get(path)

# Ensure the request was successful
if response.status != 200
error("Failed to fetch the Excel file: HTTP status code ", response.status)
end

# Read the Excel data from the fetched content
xf = XLSX.readxlsx(IOBuffer(response.body))
XLSX.readxlsx(IOBuffer(response.body))
else
# Read from a local file
xf = XLSX.readxlsx(path)
XLSX.readxlsx(path)
end
# Determine the sheet to read from

# Determine which sheet to read
sheet_to_read = isnothing(sheet) ? first(XLSX.sheetnames(xf)) : sheet

# Read the specified range or the entire sheet if range is not specified
if isnothing(range)
data = XLSX.eachtablerow(xf[sheet_to_read]) |> DataFrame
else
data = XLSX.readdata(path, sheet_to_read, range) |> DataFrame
end
# Read the table data from the specified range or full sheet
table_data = XLSX.gettable(xf[sheet_to_read])
data = DataFrame(table_data)

# Initial column name processing
if col_names == true && !isnothing(range)
col_names_row = XLSX.readdata(path, sheet_to_read, replace(range, r"[0-9]+:[0-9]+$" => "1:1"))[1, :]
rename!(data, Symbol.(col_names_row))
data = data[2:end, :]
elseif col_names != true && col_names != false
rename!(data, Symbol.(col_names))
elseif col_names == false
rename!(data, Symbol.(:auto))
# Infer and apply column types based on the first 5 rows
for col in names(data)
col_values = data[!, col]
inferred_type = infer_column_type(col_values)
data[!, col] = convert_column(col_values, inferred_type)
end

# Skipping rows
if skip > 0
data = data[(skip+1):end, :]
end

# Limiting number of rows
# Limiting the number of rows
if !isinf(n_max)
data = data[1:min(n_max, nrow(data)), :]
end

# Replace missing strings with `missing` if applicable
if !isempty(missingstring)
for missing_value in missingstring
for col in names(data)
# Apply replacement on the entire column for each missing string value
data[!, col] = replace(data[!, col], missing_value => missing)
end
end
end

# Trim whitespace
# Trim whitespace if requested
if trim_ws
for col in names(data)
if eltype(data[!, col]) == String
Expand All @@ -112,11 +104,6 @@ function read_xlsx(
end
end

# Automatic type conversion based on inferred types
for col in names(data)
data[!, col] = convert_column(data[!, col])
end

return data
end

Expand Down

2 comments on commit 7075aee

@drizk1
Copy link
Member Author

@drizk1 drizk1 commented on 7075aee Sep 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:
- adds `list_files("path", "suffix")` to list all files in a location

  • fixes type parsing issue where read_xlsx would read most types as Any

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/114857

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.5 -m "<description of version>" 7075aee8bf5bd3f7319ce9c047861176ae00ef97
git push origin v0.1.5

Please sign in to comment.