From 127797b296081ee7f6817616b338cf17fadffed7 Mon Sep 17 00:00:00 2001
From: bql <72600300+baiqinglun@users.noreply.github.com>
Date: Mon, 19 Feb 2024 16:44:54 +0800
Subject: [PATCH] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md |  29 ++++++++++++++++
 doi.csv   |   3 ++
 main.py   | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 run.bat   |   3 ++
 4 files changed, 146 insertions(+)
 create mode 100644 README.md
 create mode 100644 doi.csv
 create mode 100644 main.py
 create mode 100644 run.bat

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0697645
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+通过文献的doi使用多线程下载文献
+
+## 使用说明
+
+### 获取并更改headers
+
+进入网址 https://www.sci-hub.ee/ 后按F12进入开发选项,然后刷新网页
+
+![](https://test-123456-md-images.oss-cn-beijing.aliyuncs.com/img/20240219163328.png)
+
+然后填在headers内
+
+![](https://test-123456-md-images.oss-cn-beijing.aliyuncs.com/img/20240219163404.png)
+
+### 输入doi
+
+打开doi.csv文件,并输入doi号
+
+![](https://test-123456-md-images.oss-cn-beijing.aliyuncs.com/img/20240219163509.png)
+
+### 更改python安装目录
+
+![](https://test-123456-md-images.oss-cn-beijing.aliyuncs.com/img/20240219163613.png)
+
+### 运行下载
+
+点击run.bat开始下载
+
+![](https://test-123456-md-images.oss-cn-beijing.aliyuncs.com/img/20240219163716.png)
\ No newline at end of file
diff --git a/doi.csv b/doi.csv
new file mode 100644
index 0000000..0e51d40
--- /dev/null
+++ b/doi.csv
@@ -0,0 +1,3 @@
+doi
+10.1136/gut.2009.182170
+10.3390/Brainsci10110868
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..2c25dc9
--- /dev/null
+++ b/main.py
@@ -0,0 +1,111 @@
+"""Download papers from Sci-Hub by DOI, several at a time."""
+import requests
+import re
+import os
+import urllib.parse
+import urllib.request
+import pandas as pd
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+}
+
+SCI_HUB_URL = "https://www.sci-hub.ee/"
+CROSSREF_URL = "https://api.crossref.org/works/"
+OUTPUT_DIR = "paper"
+TIMEOUT = 10  # seconds, applied to every network request
+BLOCK_SIZE = 8192
+PDF_LINK_PATTERN = re.compile(r'/.*?\.pdf')
+# Characters that are not allowed in Windows file names.
+UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
+
+
+def _find_pdf_url(doi):
+    """Return the absolute PDF URL scraped from the Sci-Hub page of *doi*."""
+    page_url = SCI_HUB_URL + doi
+    page = requests.get(page_url, headers=headers, timeout=TIMEOUT)
+    links = PDF_LINK_PATTERN.findall(page.text)
+    # The second match is the one this script has always downloaded;
+    # NOTE(review): presumably the first match is not the file link - confirm.
+    if len(links) < 2:
+        # IndexError is what the unguarded links[1] lookup used to raise.
+        raise IndexError("No PDF link found for DOI: " + doi)
+    # urljoin resolves both "//host/x.pdf" and "/x.pdf" against the page URL.
+    return urllib.parse.urljoin(page_url, links[1])
+
+
+def _fetch_title(doi):
+    """Return the Crossref title of *doi*, or "" when it cannot be fetched."""
+    try:
+        response = requests.get(CROSSREF_URL + doi, timeout=TIMEOUT)
+    except requests.RequestException:
+        response = None
+    if response is None or response.status_code != 200:
+        print("Failed to fetch title for DOI:", doi)
+        return ""
+    titles = response.json()["message"].get("title") or [""]
+    return titles[0]
+
+
+def _build_file_name(title, doi):
+    """Return a file-system-safe "<title>.pdf", falling back to the DOI."""
+    name = UNSAFE_NAME_CHARS.sub('', title).strip()
+    if not name:
+        # Without a title every paper would be written to the same ".pdf".
+        name = UNSAFE_NAME_CHARS.sub('_', doi)
+    return name + ".pdf"
+
+
+def getPaperPdf(doi):
+    """Download the paper identified by *doi* into the "paper" directory.
+
+    Raises IndexError when no PDF link is found on the Sci-Hub page;
+    network errors from requests/urllib propagate to the caller.
+    """
+    download_url = _find_pdf_url(doi)
+    # Resolve the name first so the PDF connection is not left idle.
+    file_name = _build_file_name(_fetch_title(doi), doi)
+    # exist_ok avoids a race when several threads create the directory.
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    target = os.path.join(OUTPUT_DIR, file_name)
+
+    req = urllib.request.Request(download_url, headers=headers)
+    with urllib.request.urlopen(req, timeout=TIMEOUT) as u, open(target, 'wb') as f:
+        for buffer in iter(lambda: u.read(BLOCK_SIZE), b''):
+            f.write(buffer)
+    print("Successful to download" + " " + file_name)
+
+
+def _download_safely(doi):
+    """Run getPaperPdf and report a failure instead of killing the worker."""
+    try:
+        getPaperPdf(doi)
+    except Exception as exc:
+        print("Failed to download " + doi + ": " + str(exc))
+
+
+def download_papers(df, max_workers=8):
+    """Download every DOI in df['doi'] using at most *max_workers* threads."""
+    dois = [str(doi).strip() for doi in df['doi'].dropna()]
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        # Leaving the with-block waits for all downloads to finish.
+        pool.map(_download_safely, [doi for doi in dois if doi])
+
+
+def main():
+    """Read doi.csv, download every paper and report the elapsed time."""
+    print("下载中......................................")
+    start_time = time.time()
+    df = pd.read_csv('doi.csv')
+    download_papers(df)
+    download_time = time.time() - start_time
+    print("总下载时间:",download_time)
+    print("下载完成")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/run.bat b/run.bat
new file mode 100644
index 0000000..89ce9f0
--- /dev/null
+++ b/run.bat
@@ -0,0 +1,3 @@
+@echo off
+D:\anaconda3\python.exe .\main.py
+pause