Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add remote upload ability and csv upload ability #289

Merged
merged 4 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ lzstring = "^1.0.4"
azure-ai-documentintelligence = { version = "^1.0.0b4", optional = true }
litellm = "^1.51.0"
pydantic = "^2.9.2"
httpx = { version = "^0.28.1", optional = true }

[tool.poetry.extras]
parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "pymupdf"]
server = ["fastapi", "uvicorn", "docling", "azure-ai-formrecognizer", "azure-ai-documentintelligence"]
server = ["fastapi", "uvicorn", "docling", "azure-ai-formrecognizer", "azure-ai-documentintelligence", "httpx"]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.2"
Expand Down
130 changes: 124 additions & 6 deletions server/app/routes/filesystem.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
from fastapi.responses import FileResponse, JSONResponse
from typing import List, Optional
from typing import List, Optional, Union
import os
import yaml
import shutil
import httpx
import json
import csv
from io import StringIO
from pathlib import Path
from server.app.models import PipelineConfigRequest

Expand Down Expand Up @@ -33,19 +37,133 @@ async def check_namespace(namespace: str):
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to check/create namespace: {str(e)}")

def validate_json_content(content: bytes) -> None:
    """Validate that content can be parsed as JSON.

    Args:
        content: Raw bytes of the candidate JSON document.

    Raises:
        HTTPException: 400 when the bytes cannot be decoded or are not
            well-formed JSON.
    """
    try:
        json.loads(content)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # json.loads decodes bytes input itself; undecodable bytes surface as
        # UnicodeDecodeError (not JSONDecodeError) and are still a client error.
        raise HTTPException(status_code=400, detail=f"Invalid JSON format: {str(e)}") from e

def convert_csv_to_json(csv_content: bytes) -> bytes:
    """Convert CSV content to a JSON array with one object per data row.

    The first CSV row supplies the keys; every value is kept as a string.

    Args:
        csv_content: Raw bytes of a UTF-8 encoded CSV document (a leading
            byte-order mark is tolerated).

    Returns:
        UTF-8 encoded JSON text for the list of row dictionaries.

    Raises:
        HTTPException: 400 when the bytes are not valid UTF-8, the CSV is
            malformed, or the CSV contains no data rows.
    """
    try:
        # 'utf-8-sig' strips a leading BOM (common in spreadsheet exports) that
        # would otherwise be glued onto the first column name; BOM-less UTF-8
        # decodes exactly as before.
        csv_string = csv_content.decode('utf-8-sig')
    except UnicodeDecodeError:
        raise HTTPException(status_code=400, detail="Invalid CSV encoding")

    try:
        # newline='' leaves line endings untouched so the csv module can
        # handle \n, \r\n and \r itself, as its documentation recommends.
        rows = list(csv.DictReader(StringIO(csv_string, newline='')))
    except csv.Error as e:
        raise HTTPException(status_code=400, detail=f"Invalid CSV format: {str(e)}")

    if not rows:
        raise HTTPException(status_code=400, detail="CSV file is empty")

    return json.dumps(rows).encode('utf-8')

def is_likely_csv(content: bytes, filename: str) -> bool:
    """Guess whether content is CSV from its file name and first line.

    Args:
        content: Raw bytes of the file.
        filename: Name associated with the content; a ``.csv`` extension
            (case-insensitive) is decisive on its own.

    Returns:
        True when the name ends in ``.csv``, or when the first line contains a
        comma and none of the characters ``{}[]``. False otherwise, including
        when the first line is not valid UTF-8.
    """
    if filename.lower().endswith('.csv'):
        return True

    # No decisive extension: sniff the first line only. Splitting once avoids
    # tokenising the whole payload just to read its first line.
    try:
        first_line = content.split(b'\n', 1)[0].decode('utf-8')
    except UnicodeDecodeError:
        # Undecodable content cannot be parsed as UTF-8 CSV downstream.
        return False

    # A comma with no JSON structural characters is treated as CSV.
    return ',' in first_line and not any(c in first_line for c in '{}[]')

def _json_target_name(raw_name: Optional[str], default: str = "dataset.json") -> str:
    """Return a safe on-disk file name, swapping a trailing .csv for .json."""
    # Keep only the last path component so a crafted name such as
    # "../../etc/passwd" cannot escape the upload directory.
    name = Path((raw_name or "").replace("\\", "/")).name
    if name in ("", ".", ".."):
        name = default
    # Match the extension case-insensitively and only at the end of the name.
    if name.lower().endswith(".csv"):
        name = name[:-len(".csv")] + ".json"
    return name


def _ensure_json_bytes(content: bytes, treat_as_csv: bool) -> bytes:
    """Convert CSV content to JSON when requested, then validate the JSON."""
    if treat_as_csv:
        try:
            content = convert_csv_to_json(content)
        except HTTPException as e:
            raise HTTPException(
                status_code=400,
                detail=f"Failed to convert CSV to JSON: {str(e.detail)}"
            )
    validate_json_content(content)
    return content


@router.post("/upload-file")
async def upload_file(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    namespace: str = Form(...)
):
    """Upload a file to the namespace files directory, either from a direct upload or a URL.

    When both ``file`` and ``url`` are supplied, ``url`` takes precedence.
    CSV input is converted to JSON; the stored content is always validated as
    JSON, and a ``.csv`` extension is replaced by ``.json``.

    Args:
        file: Directly uploaded file (optional).
        url: HTTP(S) URL to download the file from (optional).
        namespace: Namespace whose ``files`` directory receives the file.

    Returns:
        A dict with the key ``path`` holding the stored file's path.

    Raises:
        HTTPException: 400 for missing input, an unsupported URL scheme, a
            failed download, or invalid CSV/JSON content; 500 for any other
            failure.
    """
    try:
        if not file and not url:
            raise HTTPException(status_code=400, detail="Either file or url must be provided")

        upload_dir = get_namespace_dir(namespace) / "files"
        upload_dir.mkdir(parents=True, exist_ok=True)

        if url:
            from urllib.parse import urlparse

            parsed_url = urlparse(url)
            if parsed_url.scheme not in ("http", "https"):
                raise HTTPException(status_code=400, detail="URL must use http or https")

            # Derive the name from the URL path only, so query strings and
            # fragments never end up in the file name.
            source_name = Path(parsed_url.path).name or "dataset.json"
            file_path = upload_dir / _json_target_name(source_name)

            # NOTE(review): the server fetches a caller-supplied URL (SSRF
            # exposure) and buffers the whole body in memory; consider a host
            # allow-list and a size cap.
            async with httpx.AsyncClient() as client:
                async with client.stream(
                    'GET',
                    url,
                    follow_redirects=True,
                ) as response:
                    if response.status_code != 200:
                        raise HTTPException(
                            status_code=400,
                            detail=f"Failed to download from URL: {response.status_code}"
                        )

                    chunks = [
                        chunk
                        async for chunk in response.aiter_bytes(chunk_size=8192)
                    ]

            content = _ensure_json_bytes(
                b''.join(chunks),
                is_likely_csv(b''.join(chunks[:1]), source_name)
                if not source_name.lower().endswith('.csv')
                else True,
            )
        else:
            # Direct upload: only the file extension decides whether the
            # content is treated as CSV.
            upload_name = file.filename or ""
            content = _ensure_json_bytes(
                await file.read(),
                upload_name.lower().endswith('.csv'),
            )
            file_path = upload_dir / _json_target_name(upload_name)

        with file_path.open("wb") as f:
            f.write(content)

        return {"path": str(file_path)}
    except HTTPException:
        # Deliberate client/server errors pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to upload file: {str(e)}") from e

@router.post("/save-documents")
Expand Down
111 changes: 0 additions & 111 deletions todos.md

This file was deleted.

12 changes: 0 additions & 12 deletions vision.md

This file was deleted.

Loading
Loading