Skip to content

Commit

Permalink
Merge pull request #8 from gustmdqkr321/crawling
Browse files Browse the repository at this point in the history
Crawling
  • Loading branch information
gustmdqkr321 authored Mar 13, 2024
2 parents 77f0f1e + 66e62cd commit 26d39dd
Showing 1 changed file with 33 additions and 12 deletions.
45 changes: 33 additions & 12 deletions back/crawling_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import re
import os
from detect import count_class
from tqdm import tqdm

## 책상 종류별 크롤링

headers = {
Expand All @@ -16,7 +18,7 @@ def get_desks(query, page=1):
"query": query,
"search_affect_type": "CuratedLink",
"page": page,
"per": 20
"per": 20,
}

response = requests.get(api_url, params=params, headers=headers)
Expand All @@ -30,31 +32,32 @@ def get_desks(query, page=1):
{
"id": fetched_desk["id"],
"name": fetched_desk["name"],
"image_url": fetched_desk["original_image_url"]
"image_url": fetched_desk["original_image_url"],
}
)
return desks


def sanitize_filename(filename):
    """Return *filename* rewritten so it can be used as a file name.

    The transformation is applied in three steps:

    1. Each of the characters ``/``, backslash, ``:``, ``*``, ``?``, ``"``,
       ``<``, ``>`` and ``|`` is replaced with an underscore.
    2. Every character for which ``str.isprintable()`` is false (newlines,
       tabs, other control characters) is dropped.
    3. Each space is replaced with an underscore.

    Args:
        filename: The raw name, e.g. a product title.

    Returns:
        The sanitized name. It may be empty if nothing printable remains.
    """
    sanitized_filename = re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", filename)
    sanitized_filename = "".join(c for c in sanitized_filename if c.isprintable())
    return sanitized_filename.replace(" ", "_")


def download_image(url, file_name, file_extension, download_folder):
    """Download *url* and save it inside *download_folder*.

    Args:
        url: Address of the image to fetch.
        file_name: Base name of the output file, without extension.
        file_extension: Extension of the output file. A leading dot is
            accepted and ignored, so both ``"jpg"`` and ``".jpg"`` (the form
            ``os.path.splitext`` returns) produce ``<file_name>.jpg``. When it
            is empty the file is written without any extension.
        download_folder: Directory the file is written into; it must exist.

    Nothing is written when the response status is anything other than 200.
    """
    # os.path.splitext keeps the leading dot in the extension; without
    # stripping it here the file would be saved as "name..jpg".
    extension = file_extension.lstrip(".")
    target_path = f"{download_folder}/{file_name}"
    if extension:
        target_path = f"{target_path}.{extension}"

    # requests never times out by default; bound the wait so that one stalled
    # server cannot hang the whole crawl.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(target_path, "wb") as f:
            f.write(response.content)


def Process_image_by_number_of_objects(data_dir):
for folder_name in os.listdir(data_dir):
folder_path = os.path.join(data_dir, folder_name)
# 해당 폴더가 디렉터리인지 확인
if os.path.isdir(folder_path):
print("Processing folder:", folder_name)

# 각 이미지 파일에 대해 클래스별 객체 수 확인
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
Expand All @@ -65,18 +68,36 @@ def Process_image_by_number_of_objects(data_dir):
if class_counts <= 2:
print("Deleting image:", file_name)
os.remove(file_path)


if __name__ == "__main__":
    # Local import: only this script entry point needs it.
    from urllib.parse import urlparse

    base_download_folder = "./train_image"

    queries = [
        "독서실책상",
        "컴퓨터책상",
        "일자형책상",
        "코너형책상",
        "h형책상",
    ]  # desk types to crawl

    # Uncomment to crawl a single query instead of the full list.
    # queries = ["독서실책상"]

    for query in queries:
        # One sub-folder per search query.
        download_folder = f"{base_download_folder}/{query}"
        os.makedirs(download_folder, exist_ok=True)

        # Pages 1..14 are fetched; change the range to adjust the page count.
        for page in tqdm(range(1, 15), desc=f"Processing {query}"):
            desks = get_desks(query, page=page)
            for desk in desks:
                # Take the extension from the URL path only (ignoring any
                # query string) and drop the leading dot that splitext keeps,
                # so the saved file is "name.jpg" rather than "name..jpg".
                url_path = urlparse(desk["image_url"]).path
                file_extension = os.path.splitext(url_path)[1].lstrip(".")
                download_image(
                    desk["image_url"],
                    sanitize_filename(desk["name"]),
                    file_extension,
                    download_folder,
                )

    # Post-processing pass over the downloaded images, currently disabled.
    # data_dir = "./train_image"
    # Process_image_by_number_of_objects(data_dir)

0 comments on commit 26d39dd

Please sign in to comment.