Skip to content

Commit

Permalink
Merge pull request #19 from TidierOrg/add-list_files
Browse files Browse the repository at this point in the history
add `list_files`
  • Loading branch information
drizk1 authored Sep 9, 2024
2 parents a57a5b4 + a675015 commit 7075aee
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 72 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TidierFiles"
uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db"
authors = ["Daniel Rizk <[email protected]> and contributors"]
version = "0.1.4"
version = "0.1.5"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Currently supported file types:
Agnostic read and write functions that detect the type and dispatch the appropriate function.
- `read_file` and `write_file`

- `list_files` to list files in a directory, optionally filtered by a pattern.

# Examples

Here is an example of how to write and read a CSV file.
Expand Down
2 changes: 2 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Currently supported file types:
Agnostic read and write functions that detect the type and dispatch the appropriate function.
- `read_file` and `write_file`

- `list_files` to list files in a directory, optionally filtered by a pattern.

# Examples

Here is an example of how to write and read a CSV file.
Expand Down
21 changes: 19 additions & 2 deletions src/TidierFiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ using RData

export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx,
read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata
write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata, list_files


include("docstrings.jl")
include("fwf.jl")
Expand Down Expand Up @@ -448,4 +448,21 @@ end

include("gen_fxn.jl")

"""
$docstring_list_files
"""
function list_files(path = "", pattern = "")
    # Recursively collect the full path of every file under `path`.
    found = String[]
    for (root, _dirs, filenames) in walkdir(path)
        append!(found, joinpath.(root, filenames))
    end
    # Keep only paths containing `pattern` ("" keeps everything).
    # NOTE: this is a substring match anywhere in the path, not a suffix match.
    matched = filter(p -> occursin(pattern, p), found)
    # The previous emptiness check ran on walkdir's per-directory tuples
    # (always non-empty when `path` exists), so it could never fire; raise
    # the error once we actually know nothing matched.
    isempty(matched) && error("No files ending with $pattern located at $path")
    return matched
end


end
26 changes: 20 additions & 6 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,12 @@ julia> write_xlsx(("REPORT_A" => df, "REPORT_B" => df2); path="xlsxtest.xlsx", o
julia> read_xlsx("xlsxtest.xlsx", sheet = "REPORT_A", skip = 1, n_max = 4, missingstring = [2])
3×3 DataFrame
Row │ integers strings floats
Any String Float64
─────┼─────────────────────────────────────────
1 │ missing Package makes 20.3
2 │ 3 File reading/writing 30.4
3 │ 4 even smoother 40.5
Row │ integers strings floats
Int64? String? Float64?
─────┼─────────────────────────────────────────
1 │ missing Package makes 20.3
2 │ 3 File reading/writing 30.4
3 │ 4 even smoother 40.5
```
"""

Expand Down Expand Up @@ -473,6 +473,7 @@ julia> write_sav(df, "test.por")
2 │ por 10.2
```
"""

const docstring_write_sas =
"""
write_sas(df, path)
Expand Down Expand Up @@ -674,4 +675,17 @@ Read `.rdata` and `.rds` files as DataFrame. `.rdata` files will result in a `Di
# Arguments
- `path`: A string with the file location. This does not yet support reading from URLs.
"""

# Docstring attached to `list_files` in TidierFiles.jl.
const docstring_list_files =
"""
    list_files(path = "", pattern = "")

List all files under `path` (searched recursively) whose path contains `pattern`.

# Arguments
- `path`: The directory to search. Defaults to an empty string.
- `pattern`: A substring used to filter the returned paths. Defaults to an
  empty string, which matches every file. Matching is by substring anywhere
  in the path, e.g. `".csv"` returns paths containing ".csv".

# Examples
- `list_files("/path/to/folder/", ".csv")`
"""
113 changes: 50 additions & 63 deletions src/xlfiles.jl
Original file line number Diff line number Diff line change
@@ -1,37 +1,43 @@
function infer_type(value)
if isa(value, Missing)
return Missing
elseif isa(value, Number)
if isa(value, Int) || isa(value, Bool)
return Int
else
return Float64
end
elseif isa(value, DateTime)
return DateTime
elseif isa(value, Time)
return Time
elseif isa(value, Date)
# Infer the element type of a column by inspecting up to its first five
# non-missing values. Checks run in order: already Int, already Float64,
# parseable as Int, parseable as Float64, parseable as a yyyy-mm-dd Date;
# anything else falls back to String.
function infer_column_type(values)
    present = filter(!ismissing, values)
    sample = present[1:min(5, length(present))]

    # Predicates: a value qualifies if it already has the type or its
    # string form parses into it.
    intlike(v) = isa(v, Int) || tryparse(Int, string(v)) !== nothing
    floatlike(v) = isa(v, Float64) || tryparse(Float64, string(v)) !== nothing
    datelike(v) = isa(v, Date) || tryparse(Date, string(v), dateformat"yyyy-mm-dd") !== nothing

    all(v -> isa(v, Int), sample) && return Int
    all(v -> isa(v, Float64), sample) && return Float64
    all(intlike, sample) && return Int
    all(floatlike, sample) && return Float64
    all(datelike, sample) && return Date
    return String
end

function convert_column(column)
non_missing_values = filter(!ismissing, column)
if isempty(non_missing_values)
return column # Return as-is if all values are missing
end

target_type = reduce((x, y) -> x === y ? x : String, map(infer_type, non_missing_values))
try
return target_type == Missing ? column : convert(Vector{target_type}, column)
catch
return column # Fallback to original if conversion fails
# Convert each element of `col` to `inferred_type`, preserving `missing`.
# Because type inference only samples the first few rows, later rows may
# fail to parse; such values become `missing` rather than leaking `nothing`
# (the previous behavior) into the column.
function convert_column(col, inferred_type)
    if inferred_type == Int
        return [x === missing ? missing :
                isa(x, Int) ? x :
                something(tryparse(Int, string(x)), missing) for x in col]
    elseif inferred_type == Float64
        return [x === missing ? missing :
                isa(x, Float64) ? x :
                something(tryparse(Float64, string(x)), missing) for x in col]
    elseif inferred_type == Date
        return [x === missing ? missing :
                isa(x, Date) ? x :
                something(tryparse(Date, string(x), dateformat"yyyy-mm-dd"), missing) for x in col]
    else
        # `string(x)` rather than `convert(String, x)`: the latter throws a
        # MethodError for non-String values in a mixed column.
        return [x === missing ? missing : string(x) for x in col]
    end
end


"""
$docstring_read_xlsx
"""
Expand All @@ -40,70 +46,56 @@ function read_xlsx(
sheet = nothing,
range = nothing,
col_names = true,
col_types = nothing,
missingstring = "",
trim_ws = true,
skip = 0,
n_max = Inf,
guess_max = nothing)


if startswith(path, "http://") || startswith(path, "https://")
# Fetch the content from the URL
n_max = Inf
)
# Fetch the Excel file (from URL or local path)
xf = if startswith(path, "http://") || startswith(path, "https://")
response = HTTP.get(path)

# Ensure the request was successful
if response.status != 200
error("Failed to fetch the Excel file: HTTP status code ", response.status)
end

# Read the Excel data from the fetched content
xf = XLSX.readxlsx(IOBuffer(response.body))
XLSX.readxlsx(IOBuffer(response.body))
else
# Read from a local file
xf = XLSX.readxlsx(path)
XLSX.readxlsx(path)
end
# Determine the sheet to read from

# Determine which sheet to read
sheet_to_read = isnothing(sheet) ? first(XLSX.sheetnames(xf)) : sheet

# Read the specified range or the entire sheet if range is not specified
if isnothing(range)
data = XLSX.eachtablerow(xf[sheet_to_read]) |> DataFrame
else
data = XLSX.readdata(path, sheet_to_read, range) |> DataFrame
end
# Read the table data from the specified range or full sheet
table_data = XLSX.gettable(xf[sheet_to_read])
data = DataFrame(table_data)

# Initial column name processing
if col_names == true && !isnothing(range)
col_names_row = XLSX.readdata(path, sheet_to_read, replace(range, r"[0-9]+:[0-9]+$" => "1:1"))[1, :]
rename!(data, Symbol.(col_names_row))
data = data[2:end, :]
elseif col_names != true && col_names != false
rename!(data, Symbol.(col_names))
elseif col_names == false
rename!(data, Symbol.(:auto))
# Infer and apply column types based on the first 5 rows
for col in names(data)
col_values = data[!, col]
inferred_type = infer_column_type(col_values)
data[!, col] = convert_column(col_values, inferred_type)
end

# Skipping rows
if skip > 0
data = data[(skip+1):end, :]
end

# Limiting number of rows
# Limiting the number of rows
if !isinf(n_max)
data = data[1:min(n_max, nrow(data)), :]
end

# Replace missing strings with `missing` if applicable
if !isempty(missingstring)
for missing_value in missingstring
for col in names(data)
# Apply replacement on the entire column for each missing string value
data[!, col] = replace(data[!, col], missing_value => missing)
end
end
end

# Trim whitespace
# Trim whitespace if requested
if trim_ws
for col in names(data)
if eltype(data[!, col]) == String
Expand All @@ -112,11 +104,6 @@ function read_xlsx(
end
end

# Automatic type conversion based on inferred types
for col in names(data)
data[!, col] = convert_column(data[!, col])
end

return data
end

Expand Down

2 comments on commit 7075aee

@drizk1
Copy link
Member Author

@drizk1 drizk1 commented on 7075aee Sep 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:
- adds `list_files("path", "suffix")` to list all files in a location

  • fixes type parsing issue where read_xlsx would read most types as Any

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/114857

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.5 -m "<description of version>" 7075aee8bf5bd3f7319ce9c047861176ae00ef97
git push origin v0.1.5

Please sign in to comment.