Skip to content

Commit

Permalink
Merge pull request #1614 from opendatalab/release-1.1.0
Browse files Browse the repository at this point in the history
Release 1.1.0
  • Loading branch information
myhloli authored Jan 23, 2025
2 parents 4d70b16 + adcace4 commit 19f72c2
Show file tree
Hide file tree
Showing 28 changed files with 542 additions and 271 deletions.
19 changes: 14 additions & 5 deletions README.md

Large diffs are not rendered by default.

19 changes: 14 additions & 5 deletions README_zh-CN.md

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions docker/ascend_npu/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
PyMuPDF>=1.24.9
PyMuPDF>=1.24.9,<=1.24.14
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
fast-langdetect>=0.2.3,<0.3.0
Expand All @@ -17,10 +17,9 @@ paddlepaddle==3.0.0b1
struct-eqtable==0.3.2
einops
accelerate
doclayout_yolo==0.0.2
rapidocr-paddle
rapidocr-onnxruntime
rapid_table==0.3.0
doclayout-yolo==0.0.2
rapid-table>=1.0.3,<2.0.0
doclayout-yolo==0.0.2b1
openai
detectron2
7 changes: 3 additions & 4 deletions docker/china/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
PyMuPDF>=1.24.9
PyMuPDF>=1.24.9,<=1.24.14
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
fast-langdetect>=0.2.3,<0.3.0
Expand All @@ -16,10 +16,9 @@ paddleocr==2.7.3
struct-eqtable==0.3.2
einops
accelerate
doclayout_yolo==0.0.2
rapidocr-paddle
rapidocr-onnxruntime
rapid_table==0.3.0
doclayout-yolo==0.0.2
rapid-table>=1.0.3,<2.0.0
doclayout-yolo==0.0.2b1
openai
detectron2
7 changes: 3 additions & 4 deletions docker/global/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
PyMuPDF>=1.24.9
PyMuPDF>=1.24.9,<=1.24.14
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
fast-langdetect>=0.2.3,<0.3.0
Expand All @@ -16,10 +16,9 @@ paddleocr==2.7.3
struct-eqtable==0.3.2
einops
accelerate
doclayout_yolo==0.0.2
rapidocr-paddle
rapidocr-onnxruntime
rapid_table==0.3.0
doclayout-yolo==0.0.2
rapid-table>=1.0.3,<2.0.0
doclayout-yolo==0.0.2b1
openai
detectron2
3 changes: 2 additions & 1 deletion magic-pdf.template.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
},
"table-config": {
"model": "rapid_table",
"sub_model": "slanet_plus",
"enable": true,
"max_time": 400
},
Expand All @@ -39,5 +40,5 @@
"enable": false
}
},
"config_version": "1.1.0"
"config_version": "1.1.1"
}
7 changes: 5 additions & 2 deletions magic_pdf/libs/boxbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,13 @@ def calculate_iou(bbox1, bbox2):
bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])

if any([bbox1_area == 0, bbox2_area == 0]):
return 0

# Compute the intersection over union by taking the intersection area
# and dividing it by the sum of both areas minus the intersection area
iou = intersection_area / float(bbox1_area + bbox2_area -
intersection_area)
iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area)

return iou


Expand Down
16 changes: 14 additions & 2 deletions magic_pdf/libs/draw_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,12 +362,24 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
for page in pdf_info:
page_line_list = []
for block in page['preproc_blocks']:
if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
if block['type'] in [BlockType.Text]:
for line in block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
if block['type'] in [BlockType.Image, BlockType.Table]:
elif block['type'] in [BlockType.Title, BlockType.InterlineEquation]:
if 'virtual_lines' in block:
if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None:
for line in block['virtual_lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
else:
for line in block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
elif block['type'] in [BlockType.Image, BlockType.Table]:
for sub_block in block['blocks']:
if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
Expand Down
9 changes: 9 additions & 0 deletions magic_pdf/libs/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,20 @@
from fast_langdetect import detect_language


def remove_invalid_surrogates(text):
    """Return ``text`` with all UTF-16 surrogate code points removed.

    Every character whose code point lies in U+D800..U+DFFF (inclusive) is
    dropped; all other characters are kept in their original order.

    A Python ``str`` stores a well-formed surrogate pair as a single code
    point above U+FFFF, so any character still inside the surrogate range
    is an unpaired (invalid) surrogate. Characters such as U+1D446 are
    therefore left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without surrogate code points.
    """
    # Remove invalid UTF-16 surrogates (lone halves of a surrogate pair).
    return ''.join(c for c in text if not (0xD800 <= ord(c) <= 0xDFFF))


def detect_lang(text: str) -> str:

if len(text) == 0:
return ""

text = text.replace("\n", "")
text = remove_invalid_surrogates(text)

# print(text)
try:
lang_upper = detect_language(text)
except:
Expand All @@ -37,3 +45,4 @@ def detect_lang(text: str) -> str:
print(detect_lang("<html>This is a test</html>"))
print(detect_lang("这个是中文测试。"))
print(detect_lang("<html>这个是中文测试。</html>"))
print(detect_lang("〖\ud835\udc46\ud835〗这是个包含utf-16的中文测试"))
Loading

0 comments on commit 19f72c2

Please sign in to comment.