From dd977ca1d89d933682d6048d716e939ace7e459f Mon Sep 17 00:00:00 2001 From: Hemanth HM Date: Mon, 30 Dec 2024 21:30:22 -0800 Subject: [PATCH] Add serve command to start CORS enabled Flask server Add a `serve` command to start a CORS-enabled Flask server for file conversion. * **New Flask Server**: Add `src/markitdown/server.py` to define a Flask server with CORS enabled and a route to convert files to markdown. * **Dependencies**: Update `pyproject.toml` to include `flask` and `flask-cors` as dependencies and add the `serve` command to the `[project.scripts]` section. * **Documentation**: Update `README.md` with instructions on how to use the `serve` command. * **Tests**: Add tests in `tests/test_markitdown.py` to verify the functionality of the `serve` command, including handling both URL and file POST data. If we decide not to add the `serve` command, [markdown-converter](https://pypi.org/project/markdown-converter/) is available as a standalone alternative. --- README.md | 17 +++++++++++++++++ pyproject.toml | 3 +++ src/markitdown/server.py | 24 ++++++++++++++++++++++++ tests/test_markitdown.py | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 src/markitdown/server.py diff --git a/README.md b/README.md index d2314c3..fbc9d12 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,23 @@ print(result.text_content) docker build -t markitdown:latest . docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md ``` + +### Serve Command + +You can start a CORS-enabled Flask server to convert files to markdown using the `serve` command: + +```sh +serve +``` + +The server will be available at `http://localhost:5000`. You can send a POST request to the `/convert` endpoint with either a file (form field `file`) or a URL (form field `url`) to convert it to markdown. + +Example using `curl`: + +```sh +curl -X POST -F 'file=@path-to-file.pdf' http://localhost:5000/convert +``` +
Batch Processing Multiple Files diff --git a/pyproject.toml b/pyproject.toml index 3e14cec..8e156f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "pathvalidate", "charset-normalizer", "openai", + "flask", + "flask-cors", ] [project.urls] @@ -52,6 +54,7 @@ path = "src/markitdown/__about__.py" [project.scripts] markitdown = "markitdown.__main__:main" +serve = "markitdown.server:app.run" [tool.hatch.envs.types] extra-dependencies = [ diff --git a/src/markitdown/server.py b/src/markitdown/server.py new file mode 100644 index 0000000..55ede9f --- /dev/null +++ b/src/markitdown/server.py @@ -0,0 +1,24 @@ +from flask import Flask, request, jsonify +from flask_cors import CORS +from markitdown import MarkItDown + +app = Flask(__name__) +CORS(app) + +markitdown = MarkItDown() + +@app.route('/convert', methods=['POST']) +def convert(): + if 'file' in request.files: + file = request.files['file'] + result = markitdown.convert(file.stream, file_extension=file.filename.split('.')[-1]) + return jsonify({'content': result.text_content}) + elif 'url' in request.form: + url = request.form['url'] + result = markitdown.convert(url) + return jsonify({'content': result.text_content}) + else: + return jsonify({'error': 'No file or URL provided'}), 400 + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=5000) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..b494dd8 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -130,6 +130,11 @@ "5bda1dd6", ] +# New test strings for the serve command +SERVE_TEST_STRINGS = [ + "While there is contemporaneous exploration of multi-agent approaches" +] + # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): @@ -300,6 +305,32 @@ def test_markitdown_llm() -> None: assert test_string in result.text_content.lower() +# New test for the serve command +def test_markitdown_serve() -> None: + from src.markitdown.server 
import app + + client = app.test_client() + + # Test with file + response = client.post( + "/convert", + data={"file": (io.BytesIO(b"test content"), "test.pdf")}, + content_type="multipart/form-data", + ) + assert response.status_code == 200 + for test_string in SERVE_TEST_STRINGS: + assert test_string in response.json["content"] + + # Test with URL + response = client.post( + "/convert", + data={"url": PDF_TEST_URL}, + ) + assert response.status_code == 200 + for test_string in SERVE_TEST_STRINGS: + assert test_string in response.json["content"] + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() @@ -307,3 +338,4 @@ def test_markitdown_llm() -> None: test_markitdown_exiftool() test_markitdown_deprecation() test_markitdown_llm() + test_markitdown_serve()