diff --git a/001-Downloader/README.md b/001-Downloader/README.md
index c3cd49b..57c78a0 100644
--- a/001-Downloader/README.md
+++ b/001-Downloader/README.md
@@ -1,9 +1,9 @@
 # 资源下载器
 
-本项目主要通过网络上开源的项目聚合成了一个跨平台的下载工具,可批量下载抖音、快手和YouTube视音频资源。下载地址:
+本项目主要通过网络上开源的项目聚合成了一个跨平台的下载工具,可批量下载抖音、快手视音频资源。下载地址:
 
-MacOS:[Downloader1.0.1-mac](https://github.com/xhunmon/PythonIsTools/releases/download/v1.0.1/downloader1.0.1-mac)
+MacOS:[Downloader1.0.3-mac](https://github.com/xhunmon/PythonIsTools/releases/download/v1.0.3/Downloader1.0.3.app)
 
-Window:[downloader1.0.1-window.exe](https://github.com/xhunmon/PythonIsTools/releases/download/v1.0.1/downloader1.0.1-window.exe)
+Windows:[downloader1.0.1-window.exe](https://github.com/xhunmon/PythonIsTools/releases/download/v1.0.1/downloader1.0.1-window.exe) (已停用,待更新)
 
 效果如图:
@@ -25,6 +25,8 @@ pyinstaller -F -i res/logo.ico main.py -w
 #③:再次进行打包,参考installer-mac.sh
 pyinstaller -F -i res/logo.ico main.spec -w
 ```
+打包脚本与配置已放在 `doc` 目录下,需要先拷贝到根目录再进行打包。
+
 注意: pyinstaller打包工具的版本与python版本、python所需第三方库以及操作系统会存在各种问题,所以需要看日志查找问题。例如:打包后运用,发现导入pyppeteer报错,通过降低版本后能正常使用:pip install pyppeteer==0.2.2
diff --git a/001-Downloader/config.ini b/001-Downloader/config.ini
index 274383e..bcea080 100644
--- a/001-Downloader/config.ini
+++ b/001-Downloader/config.ini
@@ -1,10 +1,10 @@
 # 常用配置模块
 [common]
 #软件使用截止日期
-expired_time=2022/12/15 23:59:59
+expired_time=2025/12/15 23:59:59
 #app的版本名称
-version_name=1.0.2
+version_name=1.0.3
 #app的版本号
-version_code=102
\ No newline at end of file
+version_code=103
\ No newline at end of file
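Per the `#软件使用截止日期` comment, `expired_time` acts as a use-by deadline for the app and is bumped here together with the version fields. A minimal sketch of how a gate could read these values with `configparser`; only the section and key names come from this config.ini, the check itself is an assumption, not the project's actual code:

```python
# Hypothetical expiry gate built on the config.ini above.
import configparser
from datetime import datetime

config = configparser.ConfigParser()
config.read("config.ini", encoding="utf-8")
common = config["common"]

# expired_time is stored as "2025/12/15 23:59:59"
deadline = datetime.strptime(common["expired_time"], "%Y/%m/%d %H:%M:%S")
if datetime.now() > deadline:
    raise SystemExit(f"Build {common['version_name']} ({common['version_code']}) has expired")
```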
diff --git a/001-Downloader/doc/installer-window.sh b/001-Downloader/doc/installer-window.sh
deleted file mode 100644
index d53c4c1..0000000
--- a/001-Downloader/doc/installer-window.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-pyinstaller -F -i res\\logo.ico -w main.spec main.py
--p type_enum.py
--p ui.py
--p utils.py
--p downloader.py
--p douyin\\dy_download.py
--p kuaishou\\ks_download.py
--p pytube\\captions.py
--p pytube\\cipher.py
--p pytube\\cli.py
--p pytube\\exceptions.py
--p pytube\\extract.py
--p pytube\\helpers.py
--p pytube\\innertube.py
--p pytube\\itags.py
--p pytube\\metadata.py
--p pytube\\monostate.py
--p pytube\\parser.py
--p pytube\\query.py
--p pytube\\request.py
--p pytube\\streams.py
--p pytube\\version.py
--p pytube\\__init__.py
--p pytube\\__main__.py
--p pytube\\contrib\\__init__.py
--p pytube\\contrib\\channel.py
--p pytube\\contrib\\playlist.py
--p pytube\\contrib\\search.py
\ No newline at end of file
diff --git a/001-Downloader/main.spec b/001-Downloader/doc/mac-sh/main.spec
similarity index 90%
rename from 001-Downloader/main.spec
rename to 001-Downloader/doc/mac-sh/main.spec
index afd7f6d..6b54090 100644
--- a/001-Downloader/main.spec
+++ b/001-Downloader/doc/mac-sh/main.spec
@@ -38,3 +38,7 @@ exe = EXE(pyz,
     target_arch=None,
     codesign_identity=None,
     entitlements_file=None , icon='res/logo.ico')
+app = BUNDLE(exe,
+    name='Downloader.app',
+    icon='res/logo.ico',
+    bundle_identifier=None)
\ No newline at end of file
diff --git a/001-Downloader/doc/mac-sh/pyinstaller.sh b/001-Downloader/doc/mac-sh/pyinstaller.sh
new file mode 100644
index 0000000..ebc509e
--- /dev/null
+++ b/001-Downloader/doc/mac-sh/pyinstaller.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+pyinstaller -F -i res/logo.ico main.spec main.py -w \
+-p type_enum.py \
+-p ui.py \
+-p utils.py \
+-p downloader.py \
+-p douyin/dy_download.py \
+-p kuaishou/ks_download.py
\ No newline at end of file
diff --git a/001-Downloader/doc/main-window.spec b/001-Downloader/doc/win-sh/main.spec
similarity index 90%
rename from 001-Downloader/doc/main-window.spec
rename to 001-Downloader/doc/win-sh/main.spec
index 36e43cf..16366fb 100644
--- a/001-Downloader/doc/main-window.spec
+++ b/001-Downloader/doc/win-sh/main.spec
@@ -38,3 +38,7 @@ exe = EXE(pyz,
     target_arch=None,
     codesign_identity=None,
     entitlements_file=None , icon='res\\logo.ico')
+app = BUNDLE(exe,
+    name='Downloader.exe',
+    icon='res\\logo.ico',
+    bundle_identifier=None)
\ No newline at end of file
diff --git a/001-Downloader/doc/win-sh/pyinstaller.sh b/001-Downloader/doc/win-sh/pyinstaller.sh
new file mode 100644
index 0000000..d03605f
--- /dev/null
+++ b/001-Downloader/doc/win-sh/pyinstaller.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+pyinstaller -F -i res\\logo.ico -w main.spec main.py \
+-p type_enum.py \
+-p ui.py \
+-p utils.py \
+-p downloader.py \
+-p douyin\\dy_download.py \
+-p kuaishou\\ks_download.py
\ No newline at end of file
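The packaging scripts and spec files now live under `doc/mac-sh/` and `doc/win-sh/`, and the README notes they must be copied back to the project root before building. A hedged sketch of that copy-then-build step (the helper itself is not part of the repo, and since the icon and windowed settings already live inside `main.spec`, only the spec is passed to PyInstaller here):

```python
# Assumed helper (not in the repo) mirroring the README note: copy the
# platform's spec/script out of doc/ to the project root, then build there.
import shutil
import subprocess
import sys
from pathlib import Path

root = Path(__file__).resolve().parent
src = root / "doc" / ("mac-sh" if sys.platform == "darwin" else "win-sh")

for name in ("main.spec", "pyinstaller.sh"):
    shutil.copy(src / name, root / name)  # build files must sit in the root

# Build from the copied spec; flags like -F/-i are taken from the spec itself.
subprocess.run(["pyinstaller", "main.spec"], cwd=root, check=True)
```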
diff --git a/001-Downloader/douyin/dy_download.py b/001-Downloader/douyin/dy_download.py
index bf0c2d5..77c976b 100644
--- a/001-Downloader/douyin/dy_download.py
+++ b/001-Downloader/douyin/dy_download.py
@@ -11,10 +11,8 @@
 import os
 import re
 import time
-from urllib import parse
 
 import requests
-import requests_html
 
 from downloader import Downloader
 
@@ -132,6 +130,7 @@ def get_data(self, api_post_url, max_cursor):
             Downloader.print_ui('[ 用户 ]:' + str(self.nickname) + '\r')
             max_cursor = html['max_cursor']
             result = html['aweme_list']
+            self.count = len(result)
             Downloader.print_ui('----抓获数据成功----\r')
 
             # 处理第一页视频信息
@@ -150,7 +149,8 @@ def next_data(self, max_cursor):
             return
         user_url = self.user
         # 获取用户sec_uid
-        key = re.findall('/user/(.*?)\?', str(user_url))[0]
+        # key = re.findall('/user/(.*?)\?', str(user_url))[0]
+        key = re.findall('/user/(.*?)$', str(user_url))[0]
         if not key:
             key = user_url[28:83]
 
@@ -165,7 +165,8 @@ def next_data(self, max_cursor):
                 self.end = True
                 return
             index += 1
-            Downloader.print_ui('----正在对' + max_cursor + '页进行第 %d 次尝试----\r' % index)
+            # Downloader.print_ui('----正在对' + max_cursor + '页进行第 %d 次尝试----\r' % index)
+            Downloader.print_ui('----正在对{}页进行第 {} 次尝试----\r'.format(max_cursor, index))
             time.sleep(0.3)
             response = requests.get(url=api_naxt_post_url, headers=self.headers)
             html = json.loads(response.content.decode())
@@ -173,12 +174,13 @@ def next_data(self, max_cursor):
             # 下一页值
             max_cursor = html['max_cursor']
             result = html['aweme_list']
-            Downloader.print_ui('----' + max_cursor + '页抓获数据成功----\r')
+            self.count = len(result)
+            Downloader.print_ui('----{}页抓获数据成功----\r'.format(max_cursor))
 
             # 处理下一页视频信息
             self.video_info(result, max_cursor)
         else:
-            self.end == True
-            Downloader.print_ui('----' + max_cursor + '页抓获数据失败----\r')
+            self.end = True
+            Downloader.print_ui('----{}页抓获数据失败----\r'.format(max_cursor))
             # sys.exit()
 
     # 处理视频信息
@@ -224,40 +226,47 @@ def videos_download(self, count, author_list, video_list, aweme_id, nickname, ma
             except:
                 pass
             Downloader.add_downloading_count()
-            try:
-                jx_url = f'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids={aweme_id[i]}'  # 官方接口
-                js = json.loads(requests.get(url=jx_url, headers=self.headers).text)
-                music_url = str(js['item_list'][0]['music']['play_url']['url_list'][0])
-                music_title = str(js['item_list'][0]['music']['author'])
-                if self.musicarg == "yes":  # 保留音频
-                    music = requests.get(music_url)  # 保存音频
-                    start = time.time()  # 下载开始时间
-                    size = 0  # 初始化已下载大小
-                    chunk_size = 1024  # 每次下载的数据大小
-                    content_size = int(music.headers['content-length'])  # 下载文件总大小
-                    if music.status_code == 200:  # 判断是否响应成功
-                        Downloader.print_ui('[ 音频 ]:' + author_list[i] + '[文件 大小]:{size:.2f} MB'.format(
-                            size=content_size / chunk_size / 1024))  # 开始下载,显示下载文件大小
-                        # m_url = pre_save + music_title + '-[' + author_list[i] + '].mp3'
-                        m_url = os.path.join(pre_save,
-                                             nickname[i] + "-" + music_title + '-[' + author_list[i] + '].mp3')
-                        Downloader.print_ui("路径:" + m_url)
-                        with open(m_url, 'wb') as file:  # 显示进度条
-                            for data in music.iter_content(chunk_size=chunk_size):
-                                file.write(data)
-                                size += len(data)
-                                Downloader.print_ui('\r' + music_title + '\n[下载进度]:%s%.2f%%' % (
-                                    '>' * int(size * 50 / content_size), float(size / content_size * 100)))
-                        end = time.time()  # 下载结束时间
-                        Downloader.print_ui('\n' + music_title + '\n[下载完成]:耗时: %.2f秒\n' % (end - start))  # 输出下载用时时间
-                Downloader.add_success_count()
-            except Exception as error:
-                # Downloader.print_ui2(error)
-                Downloader.print_ui('该页音频没有' + str(self.count) + '个,已为您跳过\r')
-                Downloader.add_failed_count()
-                break
+            # try:
+            #     jx_url = f'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids={aweme_id[i]}'  # 官方接口
+            #     js = json.loads(requests.get(url=jx_url, headers=self.headers).text)
+            #     music_url = str(js['item_list'][0]['music']['play_url']['url_list'][0])
+            #     music_title = str(js['item_list'][0]['music']['author'])
+            #     if self.musicarg == "yes":  # 保留音频
+            #         music = requests.get(music_url)  # 保存音频
+            #         start = time.time()  # 下载开始时间
+            #         size = 0  # 初始化已下载大小
+            #         chunk_size = 1024  # 每次下载的数据大小
+            #         content_size = int(music.headers['content-length'])  # 下载文件总大小
+            #         if music.status_code == 200:  # 判断是否响应成功
+            #             Downloader.print_ui('[ 音频 ]:' + author_list[i] + '[文件 大小]:{size:.2f} MB'.format(
+            #                 size=content_size / chunk_size / 1024))  # 开始下载,显示下载文件大小
+            #             # m_url = pre_save + music_title + '-[' + author_list[i] + '].mp3'
+            #             m_url = os.path.join(pre_save,
+            #                                  nickname[i] + "-" + music_title + '-[' + author_list[i] + '].mp3')
+            #             Downloader.print_ui("路径:" + m_url)
+            #             with open(m_url, 'wb') as file:  # 显示进度条
+            #                 for data in music.iter_content(chunk_size=chunk_size):
+            #                     file.write(data)
+            #                     size += len(data)
+            #                     Downloader.print_ui('\r' + music_title + '\n[下载进度]:%s%.2f%%' % (
+            #                         '>' * int(size * 50 / content_size), float(size / content_size * 100)))
+            #             end = time.time()  # 下载结束时间
+            #             Downloader.print_ui('\n' + music_title + '\n[下载完成]:耗时: %.2f秒\n' % (end - start))  # 输出下载用时时间
+            #     Downloader.add_success_count()
+            # except Exception as error:
+            #     # Downloader.print_ui2(error)
+            #     Downloader.print_ui('该页音频没有' + str(self.count) + '个\r')
+            #     # Downloader.add_failed_count()
+            #     # break
             try:
+                v_url = os.path.join(pre_save, nickname[i] + "-" + '[' + author_list[i] + '].mp4')
+                # 如果本地已经有了就跳过
+                if os.path.exists(v_url):
+                    Downloader.print_ui('{}-已存在!'.format(v_url))
+                    Downloader.add_success_count()
+                    continue
+
                 video = requests.get(video_list[i], headers=self.headers)  # 保存视频
                 start = time.time()  # 下载开始时间
                 size = 0  # 初始化已下载大小
@@ -267,7 +276,7 @@ def videos_download(self, count, author_list, video_list, aweme_id, nickname, ma
                 Downloader.print_ui(
                     '[ 视频 ]:' + nickname[i] + '-' + author_list[i] + '[文件 大小]:{size:.2f} MB'.format(
                         size=content_size / 1024 / 1024))  # 开始下载,显示下载文件大小
-                v_url = os.path.join(pre_save, nickname[i] + "-" + '[' + author_list[i] + '].mp4')
+                # v_url = os.path.join(pre_save, nickname[i] + "-" + '[' + author_list[i] + '].mp4')
                 # v_url = pre_save + '[' + author_list[i] + '].mp4'
                 Downloader.print_ui("路径:" + v_url)
                 with open(v_url, 'wb') as file:  # 显示进度条
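The reworked `videos_download` now skips videos that already exist locally and writes the rest to disk in chunks while printing progress. A condensed, self-contained sketch of that pattern (the function name and `stream=True` are assumptions added here; the project's code fetches the whole response body before iterating it):

```python
# Condensed sketch of the download pattern above: skip existing files,
# then write the response to disk chunk by chunk with a progress readout.
import os
import requests

def save_video(url: str, path: str, headers: dict) -> bool:
    if os.path.exists(path):  # 如果本地已经有了就跳过 (skip files already on disk)
        print(f"{path} already exists, skipping")
        return True
    resp = requests.get(url, headers=headers, stream=True)
    if resp.status_code != 200:
        return False
    total = int(resp.headers.get("content-length", 0))
    done = 0
    with open(path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024):
            f.write(chunk)
            done += len(chunk)
            if total:
                print(f"\r[下载进度] {done / total:7.2%}", end="")
    print()
    return True
```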
diff --git a/001-Downloader/downloader.py b/001-Downloader/downloader.py
index 9cc3aa9..1270700 100644
--- a/001-Downloader/downloader.py
+++ b/001-Downloader/downloader.py
@@ -40,11 +40,10 @@ def print_hint():
     Downloader.print_ui(
         """
         使用说明:
-        1、youtube下载需要先让电脑连接外网,地址如:https://www.youtube.com/watch?v=jKhP750VdXw
-        2、快手下载用户批量视频如:https://www.kuaishou.com/profile/xxx
-        3、快手下载单条视频如:https://www.kuaishou.com/short-video/xxx
-        4、抖音下载用户批量视频如:https://www.douyin.com/user/xxx
-        5、抖音下载单条视频如:https://www.douyin.com/video/xxx
+        1、快手下载用户批量视频如:https://www.kuaishou.com/profile/xxx
+        2、快手下载单条视频如:https://www.kuaishou.com/short-video/xxx
+        3、抖音下载用户批量视频如:https://www.douyin.com/user/xxx
+        4、抖音下载单条视频如:https://www.douyin.com/video/xxx
         """
     )
 
diff --git a/001-Downloader/installer-mac.sh b/001-Downloader/installer-mac.sh
deleted file mode 100644
index 04a0216..0000000
--- a/001-Downloader/installer-mac.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-pyinstaller -F -i res/logo.ico main.spec main.py -w \
--p type_enum.py \
--p ui.py \
--p utils.py \
--p downloader.py \
--p douyin/dy_download.py \
--p kuaishou/ks_download.py \
--p pytube/captions.py \
--p pytube/cipher.py \
--p pytube/cli.py \
--p pytube/exceptions.py \
--p pytube/extract.py \
--p pytube/helpers.py \
--p pytube/innertube.py \
--p pytube/itags.py \
--p pytube/metadata.py \
--p pytube/monostate.py \
--p pytube/parser.py \
--p pytube/query.py \
--p pytube/request.py \
--p pytube/streams.py \
--p pytube/version.py \
--p pytube/__init__.py \
--p pytube/__main__.py \
--p pytube/contrib/__init__.py \
--p pytube/contrib/channel.py \
--p pytube/contrib/playlist.py \
--p pytube/contrib/search.py
\ No newline at end of file
diff --git a/001-Downloader/main.py b/001-Downloader/main.py
index f2d7090..ff73936 100644
--- a/001-Downloader/main.py
+++ b/001-Downloader/main.py
@@ -14,6 +14,7 @@
 # 主模块执行
 if __name__ == "__main__":
     path = os.path.dirname(os.path.realpath(sys.argv[0]))
+    # path = os.path.dirname('/Users/Qincji/Documents/zmt/')
     app = Ui()
     app.set_dir(path)  # to do
diff --git a/001-Downloader/pytube/__init__.py b/001-Downloader/pytube/__init__.py
deleted file mode 100755
index 4eaa1b2..0000000
--- a/001-Downloader/pytube/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# flake8: noqa: F401
-# noreorder
-"""
-Pytube: a very serious Python library for downloading YouTube Videos.
-"""
-__title__ = "pytube"
-__author__ = "Ronnie Ghose, Taylor Fox Dahlin, Nick Ficano"
-__license__ = "The Unlicense (Unlicense)"
-__js__ = None
-__js_url__ = None
-
-from pytube.version import __version__
-from pytube.streams import Stream
-from pytube.captions import Caption
-from pytube.query import CaptionQuery, StreamQuery
-from pytube.__main__ import YouTube
-from pytube.contrib.playlist import Playlist
-from pytube.contrib.channel import Channel
-from pytube.contrib.search import Search
diff --git a/001-Downloader/pytube/__main__.py b/001-Downloader/pytube/__main__.py
deleted file mode 100755
index 1879a56..0000000
--- a/001-Downloader/pytube/__main__.py
+++ /dev/null
@@ -1,482 +0,0 @@
-"""
-This module implements the core developer interface for pytube.
-
-The problem domain of the :class:`YouTube class focuses almost
-exclusively on the developer interface. Pytube offloads the heavy lifting to
-smaller peripheral modules and functions.
- -""" -import logging -from typing import Any, Callable, Dict, List, Optional - -import pytube -import pytube.exceptions as exceptions -from pytube import extract, request -from pytube import Stream, StreamQuery -from pytube.helpers import install_proxy -from pytube.innertube import InnerTube -from pytube.metadata import YouTubeMetadata -from pytube.monostate import Monostate -from downloader import Downloader - -logger = logging.getLogger(__name__) - - -class YouTube(Downloader): - """Core developer interface for pytube.""" - - def __init__( - self, - url: str, - on_progress_callback: Optional[Callable[[Any, bytes, int], None]] = None, - on_complete_callback: Optional[Callable[[Any, Optional[str]], None]] = None, - proxies: Dict[str, str] = None, - use_oauth: bool = False, - allow_oauth_cache: bool = True - ): - Downloader.__init__(self) - """Construct a :class:`YouTube `. - - :param str url: - A valid YouTube watch URL. - :param func on_progress_callback: - (Optional) User defined callback function for stream download - progress events. - :param func on_complete_callback: - (Optional) User defined callback function for stream download - complete events. - - """ - self._js: Optional[str] = None # js fetched by js_url - self._js_url: Optional[str] = None # the url to the js, parsed from watch html - - self._vid_info: Optional[Dict] = None # content fetched from innertube/player - - self._watch_html: Optional[str] = None # the html of /watch?v= - self._embed_html: Optional[str] = None - self._player_config_args: Optional[Dict] = None # inline js in the html containing - self._age_restricted: Optional[bool] = None - - self._fmt_streams: Optional[List[Stream]] = None - - self._initial_data = None - self._metadata: Optional[YouTubeMetadata] = None - - # video_id part of /watch?v= - self.video_id = extract.video_id(url) - - self.watch_url = f"https://youtube.com/watch?v={self.video_id}" - self.embed_url = f"https://www.youtube.com/embed/{self.video_id}" - - # Shared between all instances of `Stream` (Borg pattern). - self.stream_monostate = Monostate( - on_progress=on_progress_callback, on_complete=on_complete_callback - ) - - if proxies: - install_proxy(proxies) - - self._author = None - self._title = None - self._publish_date = None - - self.use_oauth = use_oauth - self.allow_oauth_cache = allow_oauth_cache - - def __repr__(self): - return f'' - - def on_progress_callback(self, stream, chunk, bytes_remaining): - print("进度: %d" % (bytes_remaining)) - Downloader.print_ui(txt="%s-->进度: %d" % (self.title, bytes_remaining)) - - def on_complete_callback(self, any, file_path): - print("%s; " % self.title) - Downloader.print_ui(txt="%s 100%%" % self.title) - Downloader.add_success_count() - - def start(self, url, path): - self.register_on_complete_callback(self.on_complete_callback) - self.register_on_progress_callback(self.on_progress_callback) - Downloader.add_total_count() - Downloader.add_downloading_count() - Downloader.print_ui(txt="开始任务:%s" % url) - try: - # dl = self.streams.first() - # dl = self.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first() - dl = self.streams.filter(progressive=True, file_extension='mp4').order_by('fps').asc().first() - Downloader.print_ui(txt="准备下载:%s" % self.title) - dl.download(output_path=path) - except Exception as e: - Downloader.print_ui(txt="%s 下载失败!" 
% self.title) - - @property - def watch_html(self): - if self._watch_html: - return self._watch_html - self._watch_html = request.get(url=self.watch_url) - return self._watch_html - - @property - def embed_html(self): - if self._embed_html: - return self._embed_html - self._embed_html = request.get(url=self.embed_url) - return self._embed_html - - @property - def age_restricted(self): - if self._age_restricted: - return self._age_restricted - self._age_restricted = extract.is_age_restricted(self.watch_html) - return self._age_restricted - - @property - def js_url(self): - if self._js_url: - return self._js_url - - if self.age_restricted: - self._js_url = extract.js_url(self.embed_html) - else: - self._js_url = extract.js_url(self.watch_html) - - return self._js_url - - @property - def js(self): - if self._js: - return self._js - - # If the js_url doesn't match the cached url, fetch the new js and update - # the cache; otherwise, load the cache. - if pytube.__js_url__ != self.js_url: - self._js = request.get(self.js_url) - pytube.__js__ = self._js - pytube.__js_url__ = self.js_url - else: - self._js = pytube.__js__ - - return self._js - - @property - def initial_data(self): - if self._initial_data: - return self._initial_data - self._initial_data = extract.initial_data(self.watch_html) - return self._initial_data - - @property - def streaming_data(self): - """Return streamingData from video info.""" - if 'streamingData' in self.vid_info: - return self.vid_info['streamingData'] - else: - self.bypass_age_gate() - return self.vid_info['streamingData'] - - @property - def fmt_streams(self): - """Returns a list of streams if they have been initialized. - - If the streams have not been initialized, finds all relevant - streams and initializes them. - """ - self.check_availability() - if self._fmt_streams: - return self._fmt_streams - - self._fmt_streams = [] - - stream_manifest = extract.apply_descrambler(self.streaming_data) - - # If the cached js doesn't work, try fetching a new js file - # https://github.com/pytube/pytube/issues/1054 - try: - extract.apply_signature(stream_manifest, self.vid_info, self.js) - except exceptions.ExtractError: - # To force an update to the js file, we clear the cache and retry - self._js = None - self._js_url = None - pytube.__js__ = None - pytube.__js_url__ = None - extract.apply_signature(stream_manifest, self.vid_info, self.js) - - # build instances of :class:`Stream ` - # Initialize stream objects - for stream in stream_manifest: - video = Stream( - stream=stream, - monostate=self.stream_monostate, - ) - self._fmt_streams.append(video) - - self.stream_monostate.title = self.title - self.stream_monostate.duration = self.length - - return self._fmt_streams - - def check_availability(self): - """Check whether the video is available. - - Raises different exceptions based on why the video is unavailable, - otherwise does nothing. - """ - status, messages = extract.playability_status(self.watch_html) - - for reason in messages: - if status == 'UNPLAYABLE': - if reason == ( - 'Join this channel to get access to members-only content ' - 'like this video, and other exclusive perks.' - ): - raise exceptions.MembersOnly(video_id=self.video_id) - elif reason == 'This live stream recording is not available.': - raise exceptions.RecordingUnavailable(video_id=self.video_id) - else: - raise exceptions.VideoUnavailable(video_id=self.video_id) - elif status == 'LOGIN_REQUIRED': - if reason == ( - 'This is a private video. ' - 'Please sign in to verify that you may see it.' 
- ): - raise exceptions.VideoPrivate(video_id=self.video_id) - elif status == 'ERROR': - if reason == 'Video unavailable': - raise exceptions.VideoUnavailable(video_id=self.video_id) - elif status == 'LIVE_STREAM': - raise exceptions.LiveStreamError(video_id=self.video_id) - - @property - def vid_info(self): - """Parse the raw vid info and return the parsed result. - - :rtype: Dict[Any, Any] - """ - if self._vid_info: - return self._vid_info - - innertube = InnerTube(use_oauth=self.use_oauth, allow_cache=self.allow_oauth_cache) - - innertube_response = innertube.player(self.video_id) - self._vid_info = innertube_response - return self._vid_info - - def bypass_age_gate(self): - """Attempt to update the vid_info by bypassing the age gate.""" - innertube = InnerTube( - client='ANDROID_EMBED', - use_oauth=self.use_oauth, - allow_cache=self.allow_oauth_cache - ) - innertube_response = innertube.player(self.video_id) - - playability_status = innertube_response['playabilityStatus'].get('status', None) - - # If we still can't access the video, raise an exception - # (tier 3 age restriction) - if playability_status == 'UNPLAYABLE': - raise exceptions.AgeRestrictedError(self.video_id) - - self._vid_info = innertube_response - - @property - def caption_tracks(self) -> List[pytube.Caption]: - """Get a list of :class:`Caption `. - - :rtype: List[Caption] - """ - raw_tracks = ( - self.vid_info.get("captions", {}) - .get("playerCaptionsTracklistRenderer", {}) - .get("captionTracks", []) - ) - return [pytube.Caption(track) for track in raw_tracks] - - @property - def captions(self) -> pytube.CaptionQuery: - """Interface to query caption tracks. - - :rtype: :class:`CaptionQuery `. - """ - return pytube.CaptionQuery(self.caption_tracks) - - @property - def streams(self) -> StreamQuery: - """Interface to query both adaptive (DASH) and progressive streams. - - :rtype: :class:`StreamQuery `. - """ - self.check_availability() - return StreamQuery(self.fmt_streams) - - @property - def thumbnail_url(self) -> str: - """Get the thumbnail url image. - - :rtype: str - """ - thumbnail_details = ( - self.vid_info.get("videoDetails", {}) - .get("thumbnail", {}) - .get("thumbnails") - ) - if thumbnail_details: - thumbnail_details = thumbnail_details[-1] # last item has max size - return thumbnail_details["url"] - - return f"https://img.youtube.com/vi/{self.video_id}/maxresdefault.jpg" - - @property - def publish_date(self): - """Get the publish date. - - :rtype: datetime - """ - if self._publish_date: - return self._publish_date - self._publish_date = extract.publish_date(self.watch_html) - return self._publish_date - - @publish_date.setter - def publish_date(self, value): - """Sets the publish date.""" - self._publish_date = value - - @property - def title(self) -> str: - """Get the video title. - - :rtype: str - """ - if self._title: - return self._title - - try: - self._title = self.vid_info['videoDetails']['title'] - except KeyError: - # Check_availability will raise the correct exception in most cases - # if it doesn't, ask for a report. - self.check_availability() - raise exceptions.PytubeError( - ( - f'Exception while accessing title of {self.watch_url}. ' - 'Please file a bug report at https://github.com/pytube/pytube' - ) - ) - - return self._title - - @title.setter - def title(self, value): - """Sets the title value.""" - self._title = value - - @property - def description(self) -> str: - """Get the video description. 
- - :rtype: str - """ - return self.vid_info.get("videoDetails", {}).get("shortDescription") - - @property - def rating(self) -> float: - """Get the video average rating. - - :rtype: float - - """ - return self.vid_info.get("videoDetails", {}).get("averageRating") - - @property - def length(self) -> int: - """Get the video length in seconds. - - :rtype: int - """ - return int(self.vid_info.get('videoDetails', {}).get('lengthSeconds')) - - @property - def views(self) -> int: - """Get the number of the times the video has been viewed. - - :rtype: int - """ - return int(self.vid_info.get("videoDetails", {}).get("viewCount")) - - @property - def author(self) -> str: - """Get the video author. - :rtype: str - """ - if self._author: - return self._author - self._author = self.vid_info.get("videoDetails", {}).get( - "author", "unknown" - ) - return self._author - - @author.setter - def author(self, value): - """Set the video author.""" - self._author = value - - @property - def keywords(self) -> List[str]: - """Get the video keywords. - - :rtype: List[str] - """ - return self.vid_info.get('videoDetails', {}).get('keywords', []) - - @property - def channel_id(self) -> str: - """Get the video poster's channel id. - - :rtype: str - """ - return self.vid_info.get('videoDetails', {}).get('channelId', None) - - @property - def channel_url(self) -> str: - """Construct the channel url for the video's poster from the channel id. - - :rtype: str - """ - return f'https://www.youtube.com/channel/{self.channel_id}' - - @property - def metadata(self) -> Optional[YouTubeMetadata]: - """Get the metadata for the video. - - :rtype: YouTubeMetadata - """ - if self._metadata: - return self._metadata - else: - self._metadata = extract.metadata(self.initial_data) - return self._metadata - - def register_on_progress_callback(self, func: Callable[[Any, bytes, int], None]): - """Register a download progress callback function post initialization. - - :param callable func: - A callback function that takes ``stream``, ``chunk``, - and ``bytes_remaining`` as parameters. - - :rtype: None - - """ - self.stream_monostate.on_progress = func - - def register_on_complete_callback(self, func: Callable[[Any, Optional[str]], None]): - """Register a download complete callback function post initialization. - - :param callable func: - A callback function that takes ``stream`` and ``file_path``. - - :rtype: None - - """ - self.stream_monostate.on_complete = func diff --git a/001-Downloader/pytube/captions.py b/001-Downloader/pytube/captions.py deleted file mode 100755 index ed55f9a..0000000 --- a/001-Downloader/pytube/captions.py +++ /dev/null @@ -1,154 +0,0 @@ -import math -import os -import time -import xml.etree.ElementTree as ElementTree -from html import unescape -from typing import Dict, Optional - -from pytube import request -from pytube.helpers import safe_filename, target_directory - - -class Caption: - """Container for caption tracks.""" - - def __init__(self, caption_track: Dict): - """Construct a :class:`Caption `. - - :param dict caption_track: - Caption track data extracted from ``watch_html``. 
- """ - self.url = caption_track.get("baseUrl") - - # Certain videos have runs instead of simpleText - # this handles that edge case - name_dict = caption_track['name'] - if 'simpleText' in name_dict: - self.name = name_dict['simpleText'] - else: - for el in name_dict['runs']: - if 'text' in el: - self.name = el['text'] - - # Use "vssId" instead of "languageCode", fix issue #779 - self.code = caption_track["vssId"] - # Remove preceding '.' for backwards compatibility, e.g.: - # English -> vssId: .en, languageCode: en - # English (auto-generated) -> vssId: a.en, languageCode: en - self.code = self.code.strip('.') - - @property - def xml_captions(self) -> str: - """Download the xml caption tracks.""" - return request.get(self.url) - - def generate_srt_captions(self) -> str: - """Generate "SubRip Subtitle" captions. - - Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and - recompiles them into the "SubRip Subtitle" format. - """ - return self.xml_caption_to_srt(self.xml_captions) - - @staticmethod - def float_to_srt_time_format(d: float) -> str: - """Convert decimal durations into proper srt format. - - :rtype: str - :returns: - SubRip Subtitle (str) formatted time duration. - - float_to_srt_time_format(3.89) -> '00:00:03,890' - """ - fraction, whole = math.modf(d) - time_fmt = time.strftime("%H:%M:%S,", time.gmtime(whole)) - ms = f"{fraction:.3f}".replace("0.", "") - return time_fmt + ms - - def xml_caption_to_srt(self, xml_captions: str) -> str: - """Convert xml caption tracks to "SubRip Subtitle (srt)". - - :param str xml_captions: - XML formatted caption tracks. - """ - segments = [] - root = ElementTree.fromstring(xml_captions) - for i, child in enumerate(list(root)): - text = child.text or "" - caption = unescape(text.replace("\n", " ").replace(" ", " "),) - try: - duration = float(child.attrib["dur"]) - except KeyError: - duration = 0.0 - start = float(child.attrib["start"]) - end = start + duration - sequence_number = i + 1 # convert from 0-indexed to 1. - line = "{seq}\n{start} --> {end}\n{text}\n".format( - seq=sequence_number, - start=self.float_to_srt_time_format(start), - end=self.float_to_srt_time_format(end), - text=caption, - ) - segments.append(line) - return "\n".join(segments).strip() - - def download( - self, - title: str, - srt: bool = True, - output_path: Optional[str] = None, - filename_prefix: Optional[str] = None, - ) -> str: - """Write the media stream to disk. - - :param title: - Output filename (stem only) for writing media file. - If one is not specified, the default filename is used. - :type title: str - :param srt: - Set to True to download srt, false to download xml. Defaults to True. - :type srt bool - :param output_path: - (optional) Output path for writing media file. If one is not - specified, defaults to the current working directory. - :type output_path: str or None - :param filename_prefix: - (optional) A string that will be prepended to the filename. - For example a number in a playlist or the name of a series. - If one is not specified, nothing will be prepended - This is separate from filename so you can use the default - filename but still add a prefix. 
- :type filename_prefix: str or None - - :rtype: str - """ - if title.endswith(".srt") or title.endswith(".xml"): - filename = ".".join(title.split(".")[:-1]) - else: - filename = title - - if filename_prefix: - filename = f"{safe_filename(filename_prefix)}{filename}" - - filename = safe_filename(filename) - - filename += f" ({self.code})" - - if srt: - filename += ".srt" - else: - filename += ".xml" - - file_path = os.path.join(target_directory(output_path), filename) - - with open(file_path, "w", encoding="utf-8") as file_handle: - if srt: - file_handle.write(self.generate_srt_captions()) - else: - file_handle.write(self.xml_captions) - - return file_path - - def __repr__(self): - """Printable object representation.""" - return ''.format(s=self) diff --git a/001-Downloader/pytube/cipher.py b/001-Downloader/pytube/cipher.py deleted file mode 100755 index 70ac770..0000000 --- a/001-Downloader/pytube/cipher.py +++ /dev/null @@ -1,679 +0,0 @@ -""" -This module contains all logic necessary to decipher the signature. - -YouTube's strategy to restrict downloading videos is to send a ciphered version -of the signature to the client, along with the decryption algorithm obfuscated -in JavaScript. For the clients to play the videos, JavaScript must take the -ciphered version, cycle it through a series of "transform functions," and then -signs the media URL with the output. - -This module is responsible for (1) finding and extracting those "transform -functions" (2) maps them to Python equivalents and (3) taking the ciphered -signature and decoding it. - -""" -import logging -import re -from itertools import chain -from typing import Any, Callable, Dict, List, Optional, Tuple - -from pytube.exceptions import ExtractError, RegexMatchError -from pytube.helpers import cache, regex_search -from pytube.parser import find_object_from_startpoint, throttling_array_split - -logger = logging.getLogger(__name__) - - -class Cipher: - def __init__(self, js: str): - self.transform_plan: List[str] = get_transform_plan(js) - var_regex = re.compile(r"^\w+\W") - var_match = var_regex.search(self.transform_plan[0]) - if not var_match: - raise RegexMatchError( - caller="__init__", pattern=var_regex.pattern - ) - var = var_match.group(0)[:-1] - self.transform_map = get_transform_map(js, var) - self.js_func_patterns = [ - r"\w+\.(\w+)\(\w,(\d+)\)", - r"\w+\[(\"\w+\")\]\(\w,(\d+)\)" - ] - - self.throttling_plan = get_throttling_plan(js) - self.throttling_array = get_throttling_function_array(js) - - self.calculated_n = None - - def calculate_n(self, initial_n: list): - """Converts n to the correct value to prevent throttling.""" - if self.calculated_n: - return self.calculated_n - - # First, update all instances of 'b' with the list(initial_n) - for i in range(len(self.throttling_array)): - if self.throttling_array[i] == 'b': - self.throttling_array[i] = initial_n - - for step in self.throttling_plan: - curr_func = self.throttling_array[int(step[0])] - if not callable(curr_func): - logger.debug(f'{curr_func} is not callable.') - logger.debug(f'Throttling array:\n{self.throttling_array}\n') - raise ExtractError(f'{curr_func} is not callable.') - - first_arg = self.throttling_array[int(step[1])] - - if len(step) == 2: - curr_func(first_arg) - elif len(step) == 3: - second_arg = self.throttling_array[int(step[2])] - curr_func(first_arg, second_arg) - - self.calculated_n = ''.join(initial_n) - return self.calculated_n - - def get_signature(self, ciphered_signature: str) -> str: - """Decipher the signature. 
- - Taking the ciphered signature, applies the transform functions. - - :param str ciphered_signature: - The ciphered signature sent in the ``player_config``. - :rtype: str - :returns: - Decrypted signature required to download the media content. - """ - signature = list(ciphered_signature) - - for js_func in self.transform_plan: - name, argument = self.parse_function(js_func) # type: ignore - signature = self.transform_map[name](signature, argument) - logger.debug( - "applied transform function\n" - "output: %s\n" - "js_function: %s\n" - "argument: %d\n" - "function: %s", - "".join(signature), - name, - argument, - self.transform_map[name], - ) - - return "".join(signature) - - @cache - def parse_function(self, js_func: str) -> Tuple[str, int]: - """Parse the Javascript transform function. - - Break a JavaScript transform function down into a two element ``tuple`` - containing the function name and some integer-based argument. - - :param str js_func: - The JavaScript version of the transform function. - :rtype: tuple - :returns: - two element tuple containing the function name and an argument. - - **Example**: - - parse_function('DE.AJ(a,15)') - ('AJ', 15) - - """ - logger.debug("parsing transform function") - for pattern in self.js_func_patterns: - regex = re.compile(pattern) - parse_match = regex.search(js_func) - if parse_match: - fn_name, fn_arg = parse_match.groups() - return fn_name, int(fn_arg) - - raise RegexMatchError( - caller="parse_function", pattern="js_func_patterns" - ) - - -def get_initial_function_name(js: str) -> str: - """Extract the name of the function responsible for computing the signature. - :param str js: - The contents of the base.js asset file. - :rtype: str - :returns: - Function name from regex match - """ - - function_patterns = [ - r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # noqa: E501 - r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # noqa: E501 - r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r"\.sig\|\|(?P[a-zA-Z0-9$]+)\(", - r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 - ] - logger.debug("finding initial function name") - for pattern in function_patterns: - regex = re.compile(pattern) - function_match = regex.search(js) - if function_match: - logger.debug("finished regex search, matched: %s", pattern) - return function_match.group(1) - - raise RegexMatchError( - caller="get_initial_function_name", pattern="multiple" - ) - - -def get_transform_plan(js: str) -> List[str]: - """Extract the "transform plan". - - The "transform plan" is the functions that the ciphered signature is - cycled through to obtain the actual signature. - - :param str js: - The contents of the base.js asset file. 
- - **Example**: - - ['DE.AJ(a,15)', - 'DE.VR(a,3)', - 'DE.AJ(a,51)', - 'DE.VR(a,3)', - 'DE.kT(a,51)', - 'DE.kT(a,8)', - 'DE.VR(a,3)', - 'DE.kT(a,21)'] - """ - name = re.escape(get_initial_function_name(js)) - pattern = r"%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}" % name - logger.debug("getting transform plan") - return regex_search(pattern, js, group=1).split(";") - - -def get_transform_object(js: str, var: str) -> List[str]: - """Extract the "transform object". - - The "transform object" contains the function definitions referenced in the - "transform plan". The ``var`` argument is the obfuscated variable name - which contains these functions, for example, given the function call - ``DE.AJ(a,15)`` returned by the transform plan, "DE" would be the var. - - :param str js: - The contents of the base.js asset file. - :param str var: - The obfuscated variable name that stores an object with all functions - that descrambles the signature. - - **Example**: - - >>> get_transform_object(js, 'DE') - ['AJ:function(a){a.reverse()}', - 'VR:function(a,b){a.splice(0,b)}', - 'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}'] - - """ - pattern = r"var %s={(.*?)};" % re.escape(var) - logger.debug("getting transform object") - regex = re.compile(pattern, flags=re.DOTALL) - transform_match = regex.search(js) - if not transform_match: - raise RegexMatchError(caller="get_transform_object", pattern=pattern) - - return transform_match.group(1).replace("\n", " ").split(", ") - - -def get_transform_map(js: str, var: str) -> Dict: - """Build a transform function lookup. - - Build a lookup table of obfuscated JavaScript function names to the - Python equivalents. - - :param str js: - The contents of the base.js asset file. - :param str var: - The obfuscated variable name that stores an object with all functions - that descrambles the signature. - - """ - transform_object = get_transform_object(js, var) - mapper = {} - for obj in transform_object: - # AJ:function(a){a.reverse()} => AJ, function(a){a.reverse()} - name, function = obj.split(":", 1) - fn = map_functions(function) - mapper[name] = fn - return mapper - - -def get_throttling_function_name(js: str) -> str: - """Extract the name of the function that computes the throttling parameter. - - :param str js: - The contents of the base.js asset file. - :rtype: str - :returns: - The name of the function used to compute the throttling parameter. - """ - function_patterns = [ - # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377 - # a.C&&(b=a.get("n"))&&(b=Dea(b),a.set("n",b))}}; - # In above case, `Dea` is the relevant function name - r'a\.C&&\(b=a\.get\("n"\)\)&&\(b=([^(]+)\(b\),a\.set\("n",b\)\)}};', - ] - logger.debug('Finding throttling function name') - for pattern in function_patterns: - regex = re.compile(pattern) - function_match = regex.search(js) - if function_match: - logger.debug("finished regex search, matched: %s", pattern) - return function_match.group(1) - - raise RegexMatchError( - caller="get_throttling_function_name", pattern="multiple" - ) - - -def get_throttling_function_code(js: str) -> str: - """Extract the raw code for the throttling function. - - :param str js: - The contents of the base.js asset file. - :rtype: str - :returns: - The name of the function used to compute the throttling parameter. 
- """ - # Begin by extracting the correct function name - name = re.escape(get_throttling_function_name(js)) - - # Identify where the function is defined - pattern_start = r"%s=function\(\w\)" % name - regex = re.compile(pattern_start) - match = regex.search(js) - - # Extract the code within curly braces for the function itself, and merge any split lines - code_lines_list = find_object_from_startpoint(js, match.span()[1]).split('\n') - joined_lines = "".join(code_lines_list) - - # Prepend function definition (e.g. `Dea=function(a)`) - return match.group(0) + joined_lines - - -def get_throttling_function_array(js: str) -> List[Any]: - """Extract the "c" array. - - :param str js: - The contents of the base.js asset file. - :returns: - The array of various integers, arrays, and functions. - """ - raw_code = get_throttling_function_code(js) - - array_start = r",c=\[" - array_regex = re.compile(array_start) - match = array_regex.search(raw_code) - - array_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1) - str_array = throttling_array_split(array_raw) - - converted_array = [] - for el in str_array: - try: - converted_array.append(int(el)) - continue - except ValueError: - # Not an integer value. - pass - - if el == 'null': - converted_array.append(None) - continue - - if el.startswith('"') and el.endswith('"'): - # Convert e.g. '"abcdef"' to string without quotation marks, 'abcdef' - converted_array.append(el[1:-1]) - continue - - if el.startswith('function'): - mapper = ( - (r"{for\(\w=\(\w%\w\.length\+\w\.length\)%\w\.length;\w--;\)\w\.unshift\(\w.pop\(\)\)}", throttling_unshift), # noqa:E501 - (r"{\w\.reverse\(\)}", throttling_reverse), - (r"{\w\.push\(\w\)}", throttling_push), - (r";var\s\w=\w\[0\];\w\[0\]=\w\[\w\];\w\[\w\]=\w}", throttling_swap), - (r"case\s\d+", throttling_cipher_function), - (r"\w\.splice\(0,1,\w\.splice\(\w,1,\w\[0\]\)\[0\]\)", throttling_nested_splice), # noqa:E501 - (r";\w\.splice\(\w,1\)}", js_splice), - (r"\w\.splice\(-\w\)\.reverse\(\)\.forEach\(function\(\w\){\w\.unshift\(\w\)}\)", throttling_prepend), # noqa:E501 - (r"for\(var \w=\w\.length;\w;\)\w\.push\(\w\.splice\(--\w,1\)\[0\]\)}", throttling_reverse), # noqa:E501 - ) - - found = False - for pattern, fn in mapper: - if re.search(pattern, el): - converted_array.append(fn) - found = True - if found: - continue - - converted_array.append(el) - - # Replace null elements with array itself - for i in range(len(converted_array)): - if converted_array[i] is None: - converted_array[i] = converted_array - - return converted_array - - -def get_throttling_plan(js: str): - """Extract the "throttling plan". - - The "throttling plan" is a list of tuples used for calling functions - in the c array. The first element of the tuple is the index of the - function to call, and any remaining elements of the tuple are arguments - to pass to that function. - - :param str js: - The contents of the base.js asset file. - :returns: - The full function code for computing the throttlign parameter. 
- """ - raw_code = get_throttling_function_code(js) - - transform_start = r"try{" - plan_regex = re.compile(transform_start) - match = plan_regex.search(raw_code) - - transform_plan_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1) - - # Steps are either c[x](c[y]) or c[x](c[y],c[z]) - step_start = r"c\[(\d+)\]\(c\[(\d+)\](,c(\[(\d+)\]))?\)" - step_regex = re.compile(step_start) - matches = step_regex.findall(transform_plan_raw) - transform_steps = [] - for match in matches: - if match[4] != '': - transform_steps.append((match[0],match[1],match[4])) - else: - transform_steps.append((match[0],match[1])) - - return transform_steps - - -def reverse(arr: List, _: Optional[Any]): - """Reverse elements in a list. - - This function is equivalent to: - - .. code-block:: javascript - - function(a, b) { a.reverse() } - - This method takes an unused ``b`` variable as their transform functions - universally sent two arguments. - - **Example**: - - >>> reverse([1, 2, 3, 4]) - [4, 3, 2, 1] - """ - return arr[::-1] - - -def splice(arr: List, b: int): - """Add/remove items to/from a list. - - This function is equivalent to: - - .. code-block:: javascript - - function(a, b) { a.splice(0, b) } - - **Example**: - - >>> splice([1, 2, 3, 4], 2) - [1, 2] - """ - return arr[b:] - - -def swap(arr: List, b: int): - """Swap positions at b modulus the list length. - - This function is equivalent to: - - .. code-block:: javascript - - function(a, b) { var c=a[0];a[0]=a[b%a.length];a[b]=c } - - **Example**: - - >>> swap([1, 2, 3, 4], 2) - [3, 2, 1, 4] - """ - r = b % len(arr) - return list(chain([arr[r]], arr[1:r], [arr[0]], arr[r + 1 :])) - - -def throttling_reverse(arr: list): - """Reverses the input list. - - Needs to do an in-place reversal so that the passed list gets changed. - To accomplish this, we create a reversed copy, and then change each - indvidual element. - """ - reverse_copy = arr.copy()[::-1] - for i in range(len(reverse_copy)): - arr[i] = reverse_copy[i] - - -def throttling_push(d: list, e: Any): - """Pushes an element onto a list.""" - d.append(e) - - -def throttling_mod_func(d: list, e: int): - """Perform the modular function from the throttling array functions. - - In the javascript, the modular operation is as follows: - e = (e % d.length + d.length) % d.length - - We simply translate this to python here. - """ - return (e % len(d) + len(d)) % len(d) - - -def throttling_unshift(d: list, e: int): - """Rotates the elements of the list to the right. - - In the javascript, the operation is as follows: - for(e=(e%d.length+d.length)%d.length;e--;)d.unshift(d.pop()) - """ - e = throttling_mod_func(d, e) - new_arr = d[-e:] + d[:-e] - d.clear() - for el in new_arr: - d.append(el) - - -def throttling_cipher_function(d: list, e: str): - """This ciphers d with e to generate a new list. 
- - In the javascript, the operation is as follows: - var h = [A-Za-z0-9-_], f = 96; // simplified from switch-case loop - d.forEach( - function(l,m,n){ - this.push( - n[m]=h[ - (h.indexOf(l)-h.indexOf(this[m])+m-32+f--)%h.length - ] - ) - }, - e.split("") - ) - """ - h = list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_') - f = 96 - # by naming it "this" we can more closely reflect the js - this = list(e) - - # This is so we don't run into weirdness with enumerate while - # we change the input list - copied_list = d.copy() - - for m, l in enumerate(copied_list): - bracket_val = (h.index(l) - h.index(this[m]) + m - 32 + f) % len(h) - this.append( - h[bracket_val] - ) - d[m] = h[bracket_val] - f -= 1 - - -def throttling_nested_splice(d: list, e: int): - """Nested splice function in throttling js. - - In the javascript, the operation is as follows: - function(d,e){ - e=(e%d.length+d.length)%d.length; - d.splice( - 0, - 1, - d.splice( - e, - 1, - d[0] - )[0] - ) - } - - While testing, all this seemed to do is swap element 0 and e, - but the actual process is preserved in case there was an edge - case that was not considered. - """ - e = throttling_mod_func(d, e) - inner_splice = js_splice( - d, - e, - 1, - d[0] - ) - js_splice( - d, - 0, - 1, - inner_splice[0] - ) - - -def throttling_prepend(d: list, e: int): - """ - - In the javascript, the operation is as follows: - function(d,e){ - e=(e%d.length+d.length)%d.length; - d.splice(-e).reverse().forEach( - function(f){ - d.unshift(f) - } - ) - } - - Effectively, this moves the last e elements of d to the beginning. - """ - start_len = len(d) - # First, calculate e - e = throttling_mod_func(d, e) - - # Then do the prepending - new_arr = d[-e:] + d[:-e] - - # And update the input list - d.clear() - for el in new_arr: - d.append(el) - - end_len = len(d) - assert start_len == end_len - - -def throttling_swap(d: list, e: int): - """Swap positions of the 0'th and e'th elements in-place.""" - e = throttling_mod_func(d, e) - f = d[0] - d[0] = d[e] - d[e] = f - - -def js_splice(arr: list, start: int, delete_count=None, *items): - """Implementation of javascript's splice function. - - :param list arr: - Array to splice - :param int start: - Index at which to start changing the array - :param int delete_count: - Number of elements to delete from the array - :param *items: - Items to add to the array - - Reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice # noqa:E501 - """ - # Special conditions for start value - try: - if start > len(arr): - start = len(arr) - # If start is negative, count backwards from end - if start < 0: - start = len(arr) - start - except TypeError: - # Non-integer start values are treated as 0 in js - start = 0 - - # Special condition when delete_count is greater than remaining elements - if not delete_count or delete_count >= len(arr) - start: - delete_count = len(arr) - start # noqa: N806 - - deleted_elements = arr[start:start + delete_count] - - # Splice appropriately. - new_arr = arr[:start] + list(items) + arr[start + delete_count:] - - # Replace contents of input array - arr.clear() - for el in new_arr: - arr.append(el) - - return deleted_elements - - -def map_functions(js_func: str) -> Callable: - """For a given JavaScript transform function, return the Python equivalent. - - :param str js_func: - The JavaScript version of the transform function. 
- """ - mapper = ( - # function(a){a.reverse()} - (r"{\w\.reverse\(\)}", reverse), - # function(a,b){a.splice(0,b)} - (r"{\w\.splice\(0,\w\)}", splice), - # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c} - (r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\]=\w}", swap), - # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c} - ( - r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\%\w.length\]=\w}", - swap, - ), - ) - - for pattern, fn in mapper: - if re.search(pattern, js_func): - return fn - raise RegexMatchError(caller="map_functions", pattern="multiple") diff --git a/001-Downloader/pytube/cli.py b/001-Downloader/pytube/cli.py deleted file mode 100755 index 7a98854..0000000 --- a/001-Downloader/pytube/cli.py +++ /dev/null @@ -1,560 +0,0 @@ -#!/usr/bin/env python3 -"""A simple command line application to download youtube videos.""" -import argparse -import gzip -import json -import logging -import os -import shutil -import sys -import datetime as dt -import subprocess # nosec -from typing import List, Optional - -import pytube.exceptions as exceptions -from pytube import __version__ -from pytube import CaptionQuery, Playlist, Stream, YouTube -from pytube.helpers import safe_filename, setup_logger - - -logger = logging.getLogger(__name__) - - -def main(): - """Command line application to download youtube videos.""" - # noinspection PyTypeChecker - parser = argparse.ArgumentParser(description=main.__doc__) - args = _parse_args(parser) - if args.verbose: - log_filename = None - if args.logfile: - log_filename = args.logfile - setup_logger(logging.DEBUG, log_filename=log_filename) - logger.debug(f'Pytube version: {__version__}') - - if not args.url or "youtu" not in args.url: - parser.print_help() - sys.exit(1) - - if "/playlist" in args.url: - print("Loading playlist...") - playlist = Playlist(args.url) - if not args.target: - args.target = safe_filename(playlist.title) - for youtube_video in playlist.videos: - try: - _perform_args_on_youtube(youtube_video, args) - except exceptions.PytubeError as e: - print(f"There was an error with video: {youtube_video}") - print(e) - else: - print("Loading video...") - youtube = YouTube(args.url) - _perform_args_on_youtube(youtube, args) - - -def _perform_args_on_youtube( - youtube: YouTube, args: argparse.Namespace -) -> None: - if len(sys.argv) == 2 : # no arguments parsed - download_highest_resolution_progressive( - youtube=youtube, resolution="highest", target=args.target - ) - if args.list_captions: - _print_available_captions(youtube.captions) - if args.list: - display_streams(youtube) - if args.build_playback_report: - build_playback_report(youtube) - if args.itag: - download_by_itag(youtube=youtube, itag=args.itag, target=args.target) - if args.caption_code: - download_caption( - youtube=youtube, lang_code=args.caption_code, target=args.target - ) - if args.resolution: - download_by_resolution( - youtube=youtube, resolution=args.resolution, target=args.target - ) - if args.audio: - download_audio( - youtube=youtube, filetype=args.audio, target=args.target - ) - if args.ffmpeg: - ffmpeg_process( - youtube=youtube, resolution=args.ffmpeg, target=args.target - ) - - -def _parse_args( - parser: argparse.ArgumentParser, args: Optional[List] = None -) -> argparse.Namespace: - parser.add_argument( - "url", help="The YouTube /watch or /playlist url", nargs="?" 
- ) - parser.add_argument( - "--version", action="version", version="%(prog)s " + __version__, - ) - parser.add_argument( - "--itag", type=int, help="The itag for the desired stream", - ) - parser.add_argument( - "-r", - "--resolution", - type=str, - help="The resolution for the desired stream", - ) - parser.add_argument( - "-l", - "--list", - action="store_true", - help=( - "The list option causes pytube cli to return a list of streams " - "available to download" - ), - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - dest="verbose", - help="Set logger output to verbose output.", - ) - parser.add_argument( - "--logfile", - action="store", - help="logging debug and error messages into a log file", - ) - parser.add_argument( - "--build-playback-report", - action="store_true", - help="Save the html and js to disk", - ) - parser.add_argument( - "-c", - "--caption-code", - type=str, - help=( - "Download srt captions for given language code. " - "Prints available language codes if no argument given" - ), - ) - parser.add_argument( - '-lc', - '--list-captions', - action='store_true', - help=( - "List available caption codes for a video" - ) - ) - parser.add_argument( - "-t", - "--target", - help=( - "The output directory for the downloaded stream. " - "Default is current working directory" - ), - ) - parser.add_argument( - "-a", - "--audio", - const="mp4", - nargs="?", - help=( - "Download the audio for a given URL at the highest bitrate available" - "Defaults to mp4 format if none is specified" - ), - ) - parser.add_argument( - "-f", - "--ffmpeg", - const="best", - nargs="?", - help=( - "Downloads the audio and video stream for resolution provided" - "If no resolution is provided, downloads the best resolution" - "Runs the command line program ffmpeg to combine the audio and video" - ), - ) - - return parser.parse_args(args) - - -def build_playback_report(youtube: YouTube) -> None: - """Serialize the request data to json for offline debugging. - - :param YouTube youtube: - A YouTube object. - """ - ts = int(dt.datetime.utcnow().timestamp()) - fp = os.path.join(os.getcwd(), f"yt-video-{youtube.video_id}-{ts}.json.gz") - - js = youtube.js - watch_html = youtube.watch_html - vid_info = youtube.vid_info - - with gzip.open(fp, "wb") as fh: - fh.write( - json.dumps( - { - "url": youtube.watch_url, - "js": js, - "watch_html": watch_html, - "video_info": vid_info, - } - ).encode("utf8"), - ) - - -def display_progress_bar( - bytes_received: int, filesize: int, ch: str = "█", scale: float = 0.55 -) -> None: - """Display a simple, pretty progress bar. - - Example: - ~~~~~~~~ - PSY - GANGNAM STYLE(강남스타일) MV.mp4 - ↳ |███████████████████████████████████████| 100.0% - - :param int bytes_received: - The delta between the total file size (bytes) and bytes already - written to disk. - :param int filesize: - File size of the media stream in bytes. - :param str ch: - Character to use for presenting progress segment. - :param float scale: - Scale multiplier to reduce progress bar size. 
- - """ - columns = shutil.get_terminal_size().columns - max_width = int(columns * scale) - - filled = int(round(max_width * bytes_received / float(filesize))) - remaining = max_width - filled - progress_bar = ch * filled + " " * remaining - percent = round(100.0 * bytes_received / float(filesize), 1) - text = f" ↳ |{progress_bar}| {percent}%\r" - sys.stdout.write(text) - sys.stdout.flush() - - -# noinspection PyUnusedLocal -def on_progress( - stream: Stream, chunk: bytes, bytes_remaining: int -) -> None: # pylint: disable=W0613 - filesize = stream.filesize - bytes_received = filesize - bytes_remaining - display_progress_bar(bytes_received, filesize) - - -def _download( - stream: Stream, - target: Optional[str] = None, - filename: Optional[str] = None, -) -> None: - filesize_megabytes = stream.filesize // 1048576 - print(f"{filename or stream.default_filename} | {filesize_megabytes} MB") - file_path = stream.get_file_path(filename=filename, output_path=target) - if stream.exists_at_path(file_path): - print(f"Already downloaded at:\n{file_path}") - return - - stream.download(output_path=target, filename=filename) - sys.stdout.write("\n") - - -def _unique_name(base: str, subtype: str, media_type: str, target: str) -> str: - """ - Given a base name, the file format, and the target directory, will generate - a filename unique for that directory and file format. - :param str base: - The given base-name. - :param str subtype: - The filetype of the video which will be downloaded. - :param str media_type: - The media_type of the file, ie. "audio" or "video" - :param Path target: - Target directory for download. - """ - counter = 0 - while True: - file_name = f"{base}_{media_type}_{counter}" - file_path = os.path.join(target, f"{file_name}.{subtype}") - if not os.path.exists(file_path): - return file_name - counter += 1 - - -def ffmpeg_process( - youtube: YouTube, resolution: str, target: Optional[str] = None -) -> None: - """ - Decides the correct video stream to download, then calls _ffmpeg_downloader. - - :param YouTube youtube: - A valid YouTube object. - :param str resolution: - YouTube video resolution. 
-    :param str target:
-        Target directory for download
-    """
-    youtube.register_on_progress_callback(on_progress)
-    target = target or os.getcwd()
-
-    if resolution == "best":
-        highest_quality_stream = (
-            youtube.streams.filter(progressive=False)
-            .order_by("resolution")
-            .last()
-        )
-        mp4_stream = (
-            youtube.streams.filter(progressive=False, subtype="mp4")
-            .order_by("resolution")
-            .last()
-        )
-        if highest_quality_stream.resolution == mp4_stream.resolution:
-            video_stream = mp4_stream
-        else:
-            video_stream = highest_quality_stream
-    else:
-        video_stream = youtube.streams.filter(
-            progressive=False, resolution=resolution, subtype="mp4"
-        ).first()
-        if not video_stream:
-            video_stream = youtube.streams.filter(
-                progressive=False, resolution=resolution
-            ).first()
-    if video_stream is None:
-        print(f"Could not find a stream with resolution: {resolution}")
-        print("Try one of these:")
-        display_streams(youtube)
-        sys.exit()
-
-    audio_stream = youtube.streams.get_audio_only(video_stream.subtype)
-    if not audio_stream:
-        audio_stream = (
-            youtube.streams.filter(only_audio=True).order_by("abr").last()
-        )
-    if not audio_stream:
-        print("Could not find an audio only stream")
-        sys.exit()
-    _ffmpeg_downloader(
-        audio_stream=audio_stream, video_stream=video_stream, target=target
-    )
-
-
-def _ffmpeg_downloader(
-    audio_stream: Stream, video_stream: Stream, target: str
-) -> None:
-    """
-    Given a YouTube Stream object, finds the correct audio stream, downloads
-    them both, giving each a unique name, then uses ffmpeg to create a new file
-    with the audio and video from the previously downloaded files. Then deletes
-    the original adaptive streams, leaving the combination.
-
-    :param Stream audio_stream:
-        A valid Stream object representing the audio to download
-    :param Stream video_stream:
-        A valid Stream object representing the video to download
-    :param str target:
-        A valid target directory path
-    """
-    video_unique_name = _unique_name(
-        safe_filename(video_stream.title),
-        video_stream.subtype,
-        "video",
-        target=target,
-    )
-    audio_unique_name = _unique_name(
-        safe_filename(video_stream.title),
-        audio_stream.subtype,
-        "audio",
-        target=target,
-    )
-    _download(stream=video_stream, target=target, filename=video_unique_name)
-    print("Loading audio...")
-    _download(stream=audio_stream, target=target, filename=audio_unique_name)
-
-    video_path = os.path.join(
-        target, f"{video_unique_name}.{video_stream.subtype}"
-    )
-    audio_path = os.path.join(
-        target, f"{audio_unique_name}.{audio_stream.subtype}"
-    )
-    final_path = os.path.join(
-        target, f"{safe_filename(video_stream.title)}.{video_stream.subtype}"
-    )
-
-    subprocess.run(  # nosec
-        [
-            "ffmpeg",
-            "-i",
-            video_path,
-            "-i",
-            audio_path,
-            "-codec",
-            "copy",
-            final_path,
-        ]
-    )
-    os.unlink(video_path)
-    os.unlink(audio_path)
-
-
-def download_by_itag(
-    youtube: YouTube, itag: int, target: Optional[str] = None
-) -> None:
-    """Start downloading a YouTube video.
-
-    :param YouTube youtube:
-        A valid YouTube object.
-    :param int itag:
-        YouTube format identifier code.
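The `_ffmpeg_downloader` muxing step above reduces to one ffmpeg stream-copy invocation plus cleanup. A self-contained sketch, assuming ffmpeg is on the PATH (the three paths are hypothetical placeholders):

```python
import os
import subprocess


def mux_av(video_path: str, audio_path: str, final_path: str) -> None:
    # "-codec copy" multiplexes the two inputs into one container without
    # re-encoding; the intermediate files are then removed.
    subprocess.run(
        ["ffmpeg", "-i", video_path, "-i", audio_path, "-codec", "copy", final_path],
        check=True,
    )
    os.unlink(video_path)
    os.unlink(audio_path)


# mux_av("clip_video_0.mp4", "clip_audio_0.mp4", "clip.mp4")
```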
- :param str target: - Target directory for download - """ - stream = youtube.streams.get_by_itag(itag) - if stream is None: - print(f"Could not find a stream with itag: {itag}") - print("Try one of these:") - display_streams(youtube) - sys.exit() - - youtube.register_on_progress_callback(on_progress) - - try: - _download(stream, target=target) - except KeyboardInterrupt: - sys.exit() - - -def download_by_resolution( - youtube: YouTube, resolution: str, target: Optional[str] = None -) -> None: - """Start downloading a YouTube video. - - :param YouTube youtube: - A valid YouTube object. - :param str resolution: - YouTube video resolution. - :param str target: - Target directory for download - """ - # TODO(nficano): allow dash itags to be selected - stream = youtube.streams.get_by_resolution(resolution) - if stream is None: - print(f"Could not find a stream with resolution: {resolution}") - print("Try one of these:") - display_streams(youtube) - sys.exit() - - youtube.register_on_progress_callback(on_progress) - - try: - _download(stream, target=target) - except KeyboardInterrupt: - sys.exit() - - -def download_highest_resolution_progressive( - youtube: YouTube, resolution: str, target: Optional[str] = None -) -> None: - """Start downloading the highest resolution progressive stream. - - :param YouTube youtube: - A valid YouTube object. - :param str resolution: - YouTube video resolution. - :param str target: - Target directory for download - """ - youtube.register_on_progress_callback(on_progress) - try: - stream = youtube.streams.get_highest_resolution() - except exceptions.VideoUnavailable as err: - print(f"No video streams available: {err}") - else: - try: - _download(stream, target=target) - except KeyboardInterrupt: - sys.exit() - - -def display_streams(youtube: YouTube) -> None: - """Probe YouTube video and lists its available formats. - - :param YouTube youtube: - A valid YouTube watch URL. - - """ - for stream in youtube.streams: - print(stream) - - -def _print_available_captions(captions: CaptionQuery) -> None: - print( - f"Available caption codes are: {', '.join(c.code for c in captions)}" - ) - - -def download_caption( - youtube: YouTube, lang_code: Optional[str], target: Optional[str] = None -) -> None: - """Download a caption for the YouTube video. - - :param YouTube youtube: - A valid YouTube object. - :param str lang_code: - Language code desired for caption file. - Prints available codes if the value is None - or the desired code is not available. - :param str target: - Target directory for download - """ - try: - caption = youtube.captions[lang_code] - downloaded_path = caption.download( - title=youtube.title, output_path=target - ) - print(f"Saved caption file to: {downloaded_path}") - except KeyError: - print(f"Unable to find caption with code: {lang_code}") - _print_available_captions(youtube.captions) - - -def download_audio( - youtube: YouTube, filetype: str, target: Optional[str] = None -) -> None: - """ - Given a filetype, downloads the highest quality available audio stream for a - YouTube video. - - :param YouTube youtube: - A valid YouTube object. - :param str filetype: - Desired file format to download. - :param str target: - Target directory for download - """ - audio = ( - youtube.streams.filter(only_audio=True, subtype=filetype) - .order_by("abr") - .last() - ) - - if audio is None: - print("No audio only stream found. 
Try one of these:") - display_streams(youtube) - sys.exit() - - youtube.register_on_progress_callback(on_progress) - - try: - _download(audio, target=target) - except KeyboardInterrupt: - sys.exit() - - -if __name__ == "__main__": - main() diff --git a/001-Downloader/pytube/contrib/channel.py b/001-Downloader/pytube/contrib/channel.py deleted file mode 100755 index 147ff7e..0000000 --- a/001-Downloader/pytube/contrib/channel.py +++ /dev/null @@ -1,201 +0,0 @@ -# -*- coding: utf-8 -*- -"""Module for interacting with a user's youtube channel.""" -import json -import logging -from typing import Dict, List, Optional, Tuple - -from pytube import extract, Playlist, request -from pytube.helpers import uniqueify - -logger = logging.getLogger(__name__) - - -class Channel(Playlist): - def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None): - """Construct a :class:`Channel `. - - :param str url: - A valid YouTube channel URL. - :param proxies: - (Optional) A dictionary of proxies to use for web requests. - """ - super().__init__(url, proxies) - - self.channel_uri = extract.channel_name(url) - - self.channel_url = ( - f"https://www.youtube.com{self.channel_uri}" - ) - - self.videos_url = self.channel_url + '/videos' - self.playlists_url = self.channel_url + '/playlists' - self.community_url = self.channel_url + '/community' - self.featured_channels_url = self.channel_url + '/channels' - self.about_url = self.channel_url + '/about' - - # Possible future additions - self._playlists_html = None - self._community_html = None - self._featured_channels_html = None - self._about_html = None - - @property - def channel_name(self): - """Get the name of the YouTube channel. - - :rtype: str - """ - return self.initial_data['metadata']['channelMetadataRenderer']['title'] - - @property - def channel_id(self): - """Get the ID of the YouTube channel. - - This will return the underlying ID, not the vanity URL. - - :rtype: str - """ - return self.initial_data['metadata']['channelMetadataRenderer']['externalId'] - - @property - def vanity_url(self): - """Get the vanity URL of the YouTube channel. - - Returns None if it doesn't exist. - - :rtype: str - """ - return self.initial_data['metadata']['channelMetadataRenderer'].get('vanityChannelUrl', None) # noqa:E501 - - @property - def html(self): - """Get the html for the /videos page. - - :rtype: str - """ - if self._html: - return self._html - self._html = request.get(self.videos_url) - return self._html - - @property - def playlists_html(self): - """Get the html for the /playlists page. - - Currently unused for any functionality. - - :rtype: str - """ - if self._playlists_html: - return self._playlists_html - else: - self._playlists_html = request.get(self.playlists_url) - return self._playlists_html - - @property - def community_html(self): - """Get the html for the /community page. - - Currently unused for any functionality. - - :rtype: str - """ - if self._community_html: - return self._community_html - else: - self._community_html = request.get(self.community_url) - return self._community_html - - @property - def featured_channels_html(self): - """Get the html for the /channels page. - - Currently unused for any functionality. - - :rtype: str - """ - if self._featured_channels_html: - return self._featured_channels_html - else: - self._featured_channels_html = request.get(self.featured_channels_url) - return self._featured_channels_html - - @property - def about_html(self): - """Get the html for the /about page. 
- - Currently unused for any functionality. - - :rtype: str - """ - if self._about_html: - return self._about_html - else: - self._about_html = request.get(self.about_url) - return self._about_html - - @staticmethod - def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]: - """Extracts videos from a raw json page - - :param str raw_json: Input json extracted from the page or the last - server response - :rtype: Tuple[List[str], Optional[str]] - :returns: Tuple containing a list of up to 100 video watch ids and - a continuation token, if more videos are available - """ - initial_data = json.loads(raw_json) - # this is the json tree structure, if the json was extracted from - # html - try: - videos = initial_data["contents"][ - "twoColumnBrowseResultsRenderer"][ - "tabs"][1]["tabRenderer"]["content"][ - "sectionListRenderer"]["contents"][0][ - "itemSectionRenderer"]["contents"][0][ - "gridRenderer"]["items"] - except (KeyError, IndexError, TypeError): - try: - # this is the json tree structure, if the json was directly sent - # by the server in a continuation response - important_content = initial_data[1]['response']['onResponseReceivedActions'][ - 0 - ]['appendContinuationItemsAction']['continuationItems'] - videos = important_content - except (KeyError, IndexError, TypeError): - try: - # this is the json tree structure, if the json was directly sent - # by the server in a continuation response - # no longer a list and no longer has the "response" key - important_content = initial_data['onResponseReceivedActions'][0][ - 'appendContinuationItemsAction']['continuationItems'] - videos = important_content - except (KeyError, IndexError, TypeError) as p: - logger.info(p) - return [], None - - try: - continuation = videos[-1]['continuationItemRenderer'][ - 'continuationEndpoint' - ]['continuationCommand']['token'] - videos = videos[:-1] - except (KeyError, IndexError): - # if there is an error, no continuation is available - continuation = None - - # remove duplicates - return ( - uniqueify( - list( - # only extract the video ids from the video data - map( - lambda x: ( - f"/watch?v=" - f"{x['gridVideoRenderer']['videoId']}" - ), - videos - ) - ), - ), - continuation, - ) diff --git a/001-Downloader/pytube/contrib/playlist.py b/001-Downloader/pytube/contrib/playlist.py deleted file mode 100755 index db9c718..0000000 --- a/001-Downloader/pytube/contrib/playlist.py +++ /dev/null @@ -1,411 +0,0 @@ -"""Module to download a complete playlist from a youtube channel.""" -import json -import logging -from collections.abc import Sequence -from datetime import date, datetime -from typing import Dict, Iterable, List, Optional, Tuple, Union - -from pytube import extract, request, YouTube -from pytube.helpers import cache, DeferredGeneratorList, install_proxy, uniqueify - -logger = logging.getLogger(__name__) - - -class Playlist(Sequence): - """Load a YouTube playlist with URL""" - - def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None): - if proxies: - install_proxy(proxies) - - self._input_url = url - - # These need to be initialized as None for the properties. - self._html = None - self._ytcfg = None - self._initial_data = None - self._sidebar_info = None - - self._playlist_id = None - - @property - def playlist_id(self): - """Get the playlist id. - - :rtype: str - """ - if self._playlist_id: - return self._playlist_id - self._playlist_id = extract.playlist_id(self._input_url) - return self._playlist_id - - @property - def playlist_url(self): - """Get the base playlist url. 
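The nested try/except chains in `_extract_videos` above all guard the same failure mode: YouTube moving keys around inside a deeply nested JSON tree. A generic tolerant lookup captures the idea (the helper name is illustrative, not part of pytube):

```python
from typing import Any, Optional


def dig(obj: Any, *path: Any) -> Optional[Any]:
    # Walk dict keys and list indexes, returning None instead of raising
    # when the expected layout is missing.
    for step in path:
        try:
            obj = obj[step]
        except (KeyError, IndexError, TypeError):
            return None
    return obj


data = {"contents": {"tabs": [{"items": ["a", "b"]}]}}
print(dig(data, "contents", "tabs", 0, "items", 1))  # b
print(dig(data, "contents", "missing", 0))           # None
```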
- - :rtype: str - """ - return f"https://www.youtube.com/playlist?list={self.playlist_id}" - - @property - def html(self): - """Get the playlist page html. - - :rtype: str - """ - if self._html: - return self._html - self._html = request.get(self.playlist_url) - return self._html - - @property - def ytcfg(self): - """Extract the ytcfg from the playlist page html. - - :rtype: dict - """ - if self._ytcfg: - return self._ytcfg - self._ytcfg = extract.get_ytcfg(self.html) - return self._ytcfg - - @property - def initial_data(self): - """Extract the initial data from the playlist page html. - - :rtype: dict - """ - if self._initial_data: - return self._initial_data - else: - self._initial_data = extract.initial_data(self.html) - return self._initial_data - - @property - def sidebar_info(self): - """Extract the sidebar info from the playlist page html. - - :rtype: dict - """ - if self._sidebar_info: - return self._sidebar_info - else: - self._sidebar_info = self.initial_data['sidebar'][ - 'playlistSidebarRenderer']['items'] - return self._sidebar_info - - @property - def yt_api_key(self): - """Extract the INNERTUBE_API_KEY from the playlist ytcfg. - - :rtype: str - """ - return self.ytcfg['INNERTUBE_API_KEY'] - - def _paginate( - self, until_watch_id: Optional[str] = None - ) -> Iterable[List[str]]: - """Parse the video links from the page source, yields the /watch?v= - part from video link - - :param until_watch_id Optional[str]: YouTube Video watch id until - which the playlist should be read. - - :rtype: Iterable[List[str]] - :returns: Iterable of lists of YouTube watch ids - """ - videos_urls, continuation = self._extract_videos( - json.dumps(extract.initial_data(self.html)) - ) - if until_watch_id: - try: - trim_index = videos_urls.index(f"/watch?v={until_watch_id}") - yield videos_urls[:trim_index] - return - except ValueError: - pass - yield videos_urls - - # Extraction from a playlist only returns 100 videos at a time - # if self._extract_videos returns a continuation there are more - # than 100 songs inside a playlist, so we need to add further requests - # to gather all of them - if continuation: - load_more_url, headers, data = self._build_continuation_url(continuation) - else: - load_more_url, headers, data = None, None, None - - while load_more_url and headers and data: # there is an url found - logger.debug("load more url: %s", load_more_url) - # requesting the next page of videos with the url generated from the - # previous page, needs to be a post - req = request.post(load_more_url, extra_headers=headers, data=data) - # extract up to 100 songs from the page loaded - # returns another continuation if more videos are available - videos_urls, continuation = self._extract_videos(req) - if until_watch_id: - try: - trim_index = videos_urls.index(f"/watch?v={until_watch_id}") - yield videos_urls[:trim_index] - return - except ValueError: - pass - yield videos_urls - - if continuation: - load_more_url, headers, data = self._build_continuation_url( - continuation - ) - else: - load_more_url, headers, data = None, None, None - - def _build_continuation_url(self, continuation: str) -> Tuple[str, dict, dict]: - """Helper method to build the url and headers required to request - the next page of videos - - :param str continuation: Continuation extracted from the json response - of the last page - :rtype: Tuple[str, dict, dict] - :returns: Tuple of an url and required headers for the next http - request - """ - return ( - ( - # was changed to this format (and post requests) - # between 
2021.03.02 and 2021.03.03 - "https://www.youtube.com/youtubei/v1/browse?key=" - f"{self.yt_api_key}" - ), - { - "X-YouTube-Client-Name": "1", - "X-YouTube-Client-Version": "2.20200720.00.02", - }, - # extra data required for post request - { - "continuation": continuation, - "context": { - "client": { - "clientName": "WEB", - "clientVersion": "2.20200720.00.02" - } - } - } - ) - - @staticmethod - def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]: - """Extracts videos from a raw json page - - :param str raw_json: Input json extracted from the page or the last - server response - :rtype: Tuple[List[str], Optional[str]] - :returns: Tuple containing a list of up to 100 video watch ids and - a continuation token, if more videos are available - """ - initial_data = json.loads(raw_json) - try: - # this is the json tree structure, if the json was extracted from - # html - section_contents = initial_data["contents"][ - "twoColumnBrowseResultsRenderer"][ - "tabs"][0]["tabRenderer"]["content"][ - "sectionListRenderer"]["contents"] - try: - # Playlist without submenus - important_content = section_contents[ - 0]["itemSectionRenderer"][ - "contents"][0]["playlistVideoListRenderer"] - except (KeyError, IndexError, TypeError): - # Playlist with submenus - important_content = section_contents[ - 1]["itemSectionRenderer"][ - "contents"][0]["playlistVideoListRenderer"] - videos = important_content["contents"] - except (KeyError, IndexError, TypeError): - try: - # this is the json tree structure, if the json was directly sent - # by the server in a continuation response - # no longer a list and no longer has the "response" key - important_content = initial_data['onResponseReceivedActions'][0][ - 'appendContinuationItemsAction']['continuationItems'] - videos = important_content - except (KeyError, IndexError, TypeError) as p: - logger.info(p) - return [], None - - try: - continuation = videos[-1]['continuationItemRenderer'][ - 'continuationEndpoint' - ]['continuationCommand']['token'] - videos = videos[:-1] - except (KeyError, IndexError): - # if there is an error, no continuation is available - continuation = None - - # remove duplicates - return ( - uniqueify( - list( - # only extract the video ids from the video data - map( - lambda x: ( - f"/watch?v=" - f"{x['playlistVideoRenderer']['videoId']}" - ), - videos - ) - ), - ), - continuation, - ) - - def trimmed(self, video_id: str) -> Iterable[str]: - """Retrieve a list of YouTube video URLs trimmed at the given video ID - - i.e. if the playlist has video IDs 1,2,3,4 calling trimmed(3) returns - [1,2] - :type video_id: str - video ID to trim the returned list of playlist URLs at - :rtype: List[str] - :returns: - List of video URLs from the playlist trimmed at the given ID - """ - for page in self._paginate(until_watch_id=video_id): - yield from (self._video_url(watch_path) for watch_path in page) - - def url_generator(self): - """Generator that yields video URLs. 
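Stripped of the YouTube specifics, the paging logic above is a plain continuation-token loop: each request returns one page of items plus an optional token for the next request. A runnable skeleton, with a fake two-page fetcher standing in for the real innertube POST:

```python
from typing import Callable, Iterable, List, Optional, Tuple

# A fetcher takes an optional continuation token and returns one page of
# items plus the next token (None once exhausted).
Fetch = Callable[[Optional[str]], Tuple[List[str], Optional[str]]]


def paginate(fetch: Fetch) -> Iterable[str]:
    token: Optional[str] = None
    while True:
        items, token = fetch(token)
        yield from items
        if not token:
            return


pages = {None: (["v1", "v2"], "tok"), "tok": (["v3"], None)}
print(list(paginate(lambda t: pages[t])))  # ['v1', 'v2', 'v3']
```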
- - :Yields: Video URLs - """ - for page in self._paginate(): - for video in page: - yield self._video_url(video) - - @property # type: ignore - @cache - def video_urls(self) -> DeferredGeneratorList: - """Complete links of all the videos in playlist - - :rtype: List[str] - :returns: List of video URLs - """ - return DeferredGeneratorList(self.url_generator()) - - def videos_generator(self): - for url in self.video_urls: - yield YouTube(url) - - @property - def videos(self) -> Iterable[YouTube]: - """Yields YouTube objects of videos in this playlist - - :rtype: List[YouTube] - :returns: List of YouTube - """ - return DeferredGeneratorList(self.videos_generator()) - - def __getitem__(self, i: Union[slice, int]) -> Union[str, List[str]]: - return self.video_urls[i] - - def __len__(self) -> int: - return len(self.video_urls) - - def __repr__(self) -> str: - return f"{repr(self.video_urls)}" - - @property - @cache - def last_updated(self) -> Optional[date]: - """Extract the date that the playlist was last updated. - - :return: Date of last playlist update - :rtype: datetime.date - """ - last_updated_text = self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][ - 'stats'][2]['runs'][1]['text'] - date_components = last_updated_text.split() - month = date_components[0] - day = date_components[1].strip(',') - year = date_components[2] - return datetime.strptime( - f"{month} {day:0>2} {year}", "%b %d %Y" - ).date() - - @property - @cache - def title(self) -> Optional[str]: - """Extract playlist title - - :return: playlist title (name) - :rtype: Optional[str] - """ - return self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][ - 'title']['runs'][0]['text'] - - @property - def description(self) -> str: - return self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][ - 'description']['simpleText'] - - @property - def length(self): - """Extract the number of videos in the playlist. - - :return: Playlist video count - :rtype: int - """ - count_text = self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][ - 'stats'][0]['runs'][0]['text'] - count_text = count_text.replace(',','') - return int(count_text) - - @property - def views(self): - """Extract view count for playlist. - - :return: Playlist view count - :rtype: int - """ - # "1,234,567 views" - views_text = self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][ - 'stats'][1]['simpleText'] - # "1,234,567" - count_text = views_text.split()[0] - # "1234567" - count_text = count_text.replace(',', '') - return int(count_text) - - @property - def owner(self): - """Extract the owner of the playlist. - - :return: Playlist owner name. - :rtype: str - """ - return self.sidebar_info[1]['playlistSidebarSecondaryInfoRenderer'][ - 'videoOwner']['videoOwnerRenderer']['title']['runs'][0]['text'] - - @property - def owner_id(self): - """Extract the channel_id of the owner of the playlist. - - :return: Playlist owner's channel ID. - :rtype: str - """ - return self.sidebar_info[1]['playlistSidebarSecondaryInfoRenderer'][ - 'videoOwner']['videoOwnerRenderer']['title']['runs'][0][ - 'navigationEndpoint']['browseEndpoint']['browseId'] - - @property - def owner_url(self): - """Create the channel url of the owner of the playlist. - - :return: Playlist owner's channel url. 
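The sidebar statistics consumed by the properties above arrive as display strings, so the parsing amounts to stripping commas and zero-padding the day before `strptime`. Isolated for clarity:

```python
from datetime import date, datetime


def parse_views(views_text: str) -> int:
    # "1,234,567 views" -> 1234567
    return int(views_text.split()[0].replace(",", ""))


def parse_last_updated(text: str) -> date:
    # "Jan 1, 2021" -> date(2021, 1, 1); %d expects a zero-padded day.
    month, day, year = text.split()
    day = day.strip(",")
    return datetime.strptime(f"{month} {day:0>2} {year}", "%b %d %Y").date()


print(parse_views("1,234,567 views"))     # 1234567
print(parse_last_updated("Jan 1, 2021"))  # 2021-01-01
```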
- :rtype: str - """ - return f'https://www.youtube.com/channel/{self.owner_id}' - - @staticmethod - def _video_url(watch_path: str): - return f"https://www.youtube.com{watch_path}" diff --git a/001-Downloader/pytube/contrib/search.py b/001-Downloader/pytube/contrib/search.py deleted file mode 100755 index a10f00c..0000000 --- a/001-Downloader/pytube/contrib/search.py +++ /dev/null @@ -1,225 +0,0 @@ -"""Module for interacting with YouTube search.""" -# Native python imports -import logging - -# Local imports -from pytube import YouTube -from pytube.innertube import InnerTube - - -logger = logging.getLogger(__name__) - - -class Search: - def __init__(self, query): - """Initialize Search object. - - :param str query: - Search query provided by the user. - """ - self.query = query - self._innertube_client = InnerTube() - - # The first search, without a continuation, is structured differently - # and contains completion suggestions, so we must store this separately - self._initial_results = None - - self._results = None - self._completion_suggestions = None - - # Used for keeping track of query continuations so that new results - # are always returned when get_next_results() is called - self._current_continuation = None - - @property - def completion_suggestions(self): - """Return query autocompletion suggestions for the query. - - :rtype: list - :returns: - A list of autocomplete suggestions provided by YouTube for the query. - """ - if self._completion_suggestions: - return self._completion_suggestions - if self.results: - self._completion_suggestions = self._initial_results['refinements'] - return self._completion_suggestions - - @property - def results(self): - """Return search results. - - On first call, will generate and return the first set of results. - Additional results can be generated using ``.get_next_results()``. - - :rtype: list - :returns: - A list of YouTube objects. - """ - if self._results: - return self._results - - videos, continuation = self.fetch_and_parse() - self._results = videos - self._current_continuation = continuation - return self._results - - def get_next_results(self): - """Use the stored continuation string to fetch the next set of results. - - This method does not return the results, but instead updates the results property. - """ - if self._current_continuation: - videos, continuation = self.fetch_and_parse(self._current_continuation) - self._results.extend(videos) - self._current_continuation = continuation - else: - raise IndexError - - def fetch_and_parse(self, continuation=None): - """Fetch from the innertube API and parse the results. - - :param str continuation: - Continuation string for fetching results. - :rtype: tuple - :returns: - A tuple of a list of YouTube objects and a continuation string. 
- """ - # Begin by executing the query and identifying the relevant sections - # of the results - raw_results = self.fetch_query(continuation) - - # Initial result is handled by try block, continuations by except block - try: - sections = raw_results['contents']['twoColumnSearchResultsRenderer'][ - 'primaryContents']['sectionListRenderer']['contents'] - except KeyError: - sections = raw_results['onResponseReceivedCommands'][0][ - 'appendContinuationItemsAction']['continuationItems'] - item_renderer = None - continuation_renderer = None - for s in sections: - if 'itemSectionRenderer' in s: - item_renderer = s['itemSectionRenderer'] - if 'continuationItemRenderer' in s: - continuation_renderer = s['continuationItemRenderer'] - - # If the continuationItemRenderer doesn't exist, assume no further results - if continuation_renderer: - next_continuation = continuation_renderer['continuationEndpoint'][ - 'continuationCommand']['token'] - else: - next_continuation = None - - # If the itemSectionRenderer doesn't exist, assume no results. - if item_renderer: - videos = [] - raw_video_list = item_renderer['contents'] - for video_details in raw_video_list: - # Skip over ads - if video_details.get('searchPyvRenderer', {}).get('ads', None): - continue - - # Skip "recommended" type videos e.g. "people also watched" and "popular X" - # that break up the search results - if 'shelfRenderer' in video_details: - continue - - # Skip auto-generated "mix" playlist results - if 'radioRenderer' in video_details: - continue - - # Skip playlist results - if 'playlistRenderer' in video_details: - continue - - # Skip channel results - if 'channelRenderer' in video_details: - continue - - # Skip 'people also searched for' results - if 'horizontalCardListRenderer' in video_details: - continue - - # Can't seem to reproduce, probably related to typo fix suggestions - if 'didYouMeanRenderer' in video_details: - continue - - # Seems to be the renderer used for the image shown on a no results page - if 'backgroundPromoRenderer' in video_details: - continue - - if 'videoRenderer' not in video_details: - logger.warn('Unexpected renderer encountered.') - logger.warn(f'Renderer name: {video_details.keys()}') - logger.warn(f'Search term: {self.query}') - logger.warn( - 'Please open an issue at ' - 'https://github.com/pytube/pytube/issues ' - 'and provide this log output.' - ) - continue - - # Extract relevant video information from the details. - # Some of this can be used to pre-populate attributes of the - # YouTube object. 
- vid_renderer = video_details['videoRenderer'] - vid_id = vid_renderer['videoId'] - vid_url = f'https://www.youtube.com/watch?v={vid_id}' - vid_title = vid_renderer['title']['runs'][0]['text'] - vid_channel_name = vid_renderer['ownerText']['runs'][0]['text'] - vid_channel_uri = vid_renderer['ownerText']['runs'][0][ - 'navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - # Livestreams have "runs", non-livestreams have "simpleText", - # and scheduled releases do not have 'viewCountText' - if 'viewCountText' in vid_renderer: - if 'runs' in vid_renderer['viewCountText']: - vid_view_count_text = vid_renderer['viewCountText']['runs'][0]['text'] - else: - vid_view_count_text = vid_renderer['viewCountText']['simpleText'] - # Strip ' views' text, then remove commas - stripped_text = vid_view_count_text.split()[0].replace(',','') - if stripped_text == 'No': - vid_view_count = 0 - else: - vid_view_count = int(stripped_text) - else: - vid_view_count = 0 - if 'lengthText' in vid_renderer: - vid_length = vid_renderer['lengthText']['simpleText'] - else: - vid_length = None - - vid_metadata = { - 'id': vid_id, - 'url': vid_url, - 'title': vid_title, - 'channel_name': vid_channel_name, - 'channel_url': vid_channel_uri, - 'view_count': vid_view_count, - 'length': vid_length - } - - # Construct YouTube object from metadata and append to results - vid = YouTube(vid_metadata['url']) - vid.author = vid_metadata['channel_name'] - vid.title = vid_metadata['title'] - videos.append(vid) - else: - videos = None - - return videos, next_continuation - - def fetch_query(self, continuation=None): - """Fetch raw results from the innertube API. - - :param str continuation: - Continuation string for fetching results. - :rtype: dict - :returns: - The raw json object returned by the innertube API. - """ - query_results = self._innertube_client.search(self.query, continuation) - if not self._initial_results: - self._initial_results = query_results - return query_results # noqa:R504 diff --git a/001-Downloader/pytube/exceptions.py b/001-Downloader/pytube/exceptions.py deleted file mode 100755 index ec44d2a..0000000 --- a/001-Downloader/pytube/exceptions.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Library specific exception definitions.""" -from typing import Pattern, Union - - -class PytubeError(Exception): - """Base pytube exception that all others inherit. - - This is done to not pollute the built-in exceptions, which *could* result - in unintended errors being unexpectedly and incorrectly handled within - implementers code. - """ - - -class MaxRetriesExceeded(PytubeError): - """Maximum number of retries exceeded.""" - - -class HTMLParseError(PytubeError): - """HTML could not be parsed""" - - -class ExtractError(PytubeError): - """Data extraction based exception.""" - - -class RegexMatchError(ExtractError): - """Regex pattern did not return any matches.""" - - def __init__(self, caller: str, pattern: Union[str, Pattern]): - """ - :param str caller: - Calling function - :param str pattern: - Pattern that failed to match - """ - super().__init__(f"{caller}: could not find match for {pattern}") - self.caller = caller - self.pattern = pattern - - -class VideoUnavailable(PytubeError): - """Base video unavailable error.""" - def __init__(self, video_id: str): - """ - :param str video_id: - A YouTube video identifier. 
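The `viewCountText` handling above has two quirks worth isolating: counts arrive as display strings with commas, and a count of zero is spelled "No views". A small sketch of just that conversion:

```python
def parse_view_count(text: str) -> int:
    # "1,234 views" -> 1234; "No views" -> 0.
    first_token = text.split()[0].replace(",", "")
    return 0 if first_token == "No" else int(first_token)


print(parse_view_count("1,234 views"))  # 1234
print(parse_view_count("No views"))     # 0
```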
- """ - self.video_id = video_id - super().__init__(self.error_string) - - @property - def error_string(self): - return f'{self.video_id} is unavailable' - - -class AgeRestrictedError(VideoUnavailable): - """Video is age restricted, and cannot be accessed without OAuth.""" - def __init__(self, video_id: str): - """ - :param str video_id: - A YouTube video identifier. - """ - self.video_id = video_id - super().__init__(self.video_id) - - @property - def error_string(self): - return f"{self.video_id} is age restricted, and can't be accessed without logging in." - - -class LiveStreamError(VideoUnavailable): - """Video is a live stream.""" - def __init__(self, video_id: str): - """ - :param str video_id: - A YouTube video identifier. - """ - self.video_id = video_id - super().__init__(self.video_id) - - @property - def error_string(self): - return f'{self.video_id} is streaming live and cannot be loaded' - - -class VideoPrivate(VideoUnavailable): - def __init__(self, video_id: str): - """ - :param str video_id: - A YouTube video identifier. - """ - self.video_id = video_id - super().__init__(self.video_id) - - @property - def error_string(self): - return f'{self.video_id} is a private video' - - -class RecordingUnavailable(VideoUnavailable): - def __init__(self, video_id: str): - """ - :param str video_id: - A YouTube video identifier. - """ - self.video_id = video_id - super().__init__(self.video_id) - - @property - def error_string(self): - return f'{self.video_id} does not have a live stream recording available' - - -class MembersOnly(VideoUnavailable): - """Video is members-only. - - YouTube has special videos that are only viewable to users who have - subscribed to a content creator. - ref: https://support.google.com/youtube/answer/7544492?hl=en - """ - def __init__(self, video_id: str): - """ - :param str video_id: - A YouTube video identifier. - """ - self.video_id = video_id - super().__init__(self.video_id) - - @property - def error_string(self): - return f'{self.video_id} is a members-only video' - - -class VideoRegionBlocked(VideoUnavailable): - def __init__(self, video_id: str): - """ - :param str video_id: - A YouTube video identifier. - """ - self.video_id = video_id - super().__init__(self.video_id) - - @property - def error_string(self): - return f'{self.video_id} is not available in your region' diff --git a/001-Downloader/pytube/extract.py b/001-Downloader/pytube/extract.py deleted file mode 100755 index d083214..0000000 --- a/001-Downloader/pytube/extract.py +++ /dev/null @@ -1,579 +0,0 @@ -"""This module contains all non-cipher related data extraction logic.""" -import logging -import urllib.parse -import re -from collections import OrderedDict -from datetime import datetime -from typing import Any, Dict, List, Optional, Tuple -from urllib.parse import parse_qs, quote, urlencode, urlparse - -from pytube.cipher import Cipher -from pytube.exceptions import HTMLParseError, LiveStreamError, RegexMatchError -from pytube.helpers import regex_search -from pytube.metadata import YouTubeMetadata -from pytube.parser import parse_for_object, parse_for_all_objects - - -logger = logging.getLogger(__name__) - - -def publish_date(watch_html: str): - """Extract publish date - :param str watch_html: - The html contents of the watch page. - :rtype: str - :returns: - Publish date of the video. 
- """ - try: - result = regex_search( - r"(?<=itemprop=\"datePublished\" content=\")\d{4}-\d{2}-\d{2}", - watch_html, group=0 - ) - except RegexMatchError: - return None - return datetime.strptime(result, '%Y-%m-%d') - - -def recording_available(watch_html): - """Check if live stream recording is available. - - :param str watch_html: - The html contents of the watch page. - :rtype: bool - :returns: - Whether or not the content is private. - """ - unavailable_strings = [ - 'This live stream recording is not available.' - ] - for string in unavailable_strings: - if string in watch_html: - return False - return True - - -def is_private(watch_html): - """Check if content is private. - - :param str watch_html: - The html contents of the watch page. - :rtype: bool - :returns: - Whether or not the content is private. - """ - private_strings = [ - "This is a private video. Please sign in to verify that you may see it.", - "\"simpleText\":\"Private video\"", - "This video is private." - ] - for string in private_strings: - if string in watch_html: - return True - return False - - -def is_age_restricted(watch_html: str) -> bool: - """Check if content is age restricted. - - :param str watch_html: - The html contents of the watch page. - :rtype: bool - :returns: - Whether or not the content is age restricted. - """ - try: - regex_search(r"og:restrictions:age", watch_html, group=0) - except RegexMatchError: - return False - return True - - -def playability_status(watch_html: str) -> (str, str): - """Return the playability status and status explanation of a video. - - For example, a video may have a status of LOGIN_REQUIRED, and an explanation - of "This is a private video. Please sign in to verify that you may see it." - - This explanation is what gets incorporated into the media player overlay. - - :param str watch_html: - The html contents of the watch page. - :rtype: bool - :returns: - Playability status and reason of the video. - """ - player_response = initial_player_response(watch_html) - status_dict = player_response.get('playabilityStatus', {}) - if 'liveStreamability' in status_dict: - return 'LIVE_STREAM', 'Video is a live stream.' - if 'status' in status_dict: - if 'reason' in status_dict: - return status_dict['status'], [status_dict['reason']] - if 'messages' in status_dict: - return status_dict['status'], status_dict['messages'] - return None, [None] - - -def video_id(url: str) -> str: - """Extract the ``video_id`` from a YouTube url. - - This function supports the following patterns: - - - :samp:`https://youtube.com/watch?v={video_id}` - - :samp:`https://youtube.com/embed/{video_id}` - - :samp:`https://youtu.be/{video_id}` - - :param str url: - A YouTube url containing a video id. - :rtype: str - :returns: - YouTube video id. - """ - return regex_search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", url, group=1) - - -def playlist_id(url: str) -> str: - """Extract the ``playlist_id`` from a YouTube url. - - This function supports the following patterns: - - - :samp:`https://youtube.com/playlist?list={playlist_id}` - - :samp:`https://youtube.com/watch?v={video_id}&list={playlist_id}` - - :param str url: - A YouTube url containing a playlist id. - :rtype: str - :returns: - YouTube playlist id. - """ - parsed = urllib.parse.urlparse(url) - return parse_qs(parsed.query)['list'][0] - - -def channel_name(url: str) -> str: - """Extract the ``channel_name`` or ``channel_id`` from a YouTube url. 
- - This function supports the following patterns: - - - :samp:`https://youtube.com/c/{channel_name}/*` - - :samp:`https://youtube.com/channel/{channel_id}/* - - :samp:`https://youtube.com/u/{channel_name}/*` - - :samp:`https://youtube.com/user/{channel_id}/* - - :param str url: - A YouTube url containing a channel name. - :rtype: str - :returns: - YouTube channel name. - """ - patterns = [ - r"(?:\/(c)\/([%\d\w_\-]+)(\/.*)?)", - r"(?:\/(channel)\/([%\w\d_\-]+)(\/.*)?)", - r"(?:\/(u)\/([%\d\w_\-]+)(\/.*)?)", - r"(?:\/(user)\/([%\w\d_\-]+)(\/.*)?)" - ] - for pattern in patterns: - regex = re.compile(pattern) - function_match = regex.search(url) - if function_match: - logger.debug("finished regex search, matched: %s", pattern) - uri_style = function_match.group(1) - uri_identifier = function_match.group(2) - return f'/{uri_style}/{uri_identifier}' - - raise RegexMatchError( - caller="channel_name", pattern="patterns" - ) - - -def video_info_url(video_id: str, watch_url: str) -> str: - """Construct the video_info url. - - :param str video_id: - A YouTube video identifier. - :param str watch_url: - A YouTube watch url. - :rtype: str - :returns: - :samp:`https://youtube.com/get_video_info` with necessary GET - parameters. - """ - params = OrderedDict( - [ - ("video_id", video_id), - ("ps", "default"), - ("eurl", quote(watch_url)), - ("hl", "en_US"), - ("html5", "1"), - ("c", "TVHTML5"), - ("cver", "7.20201028"), - ] - ) - return _video_info_url(params) - - -def video_info_url_age_restricted(video_id: str, embed_html: str) -> str: - """Construct the video_info url. - - :param str video_id: - A YouTube video identifier. - :param str embed_html: - The html contents of the embed page (for age restricted videos). - :rtype: str - :returns: - :samp:`https://youtube.com/get_video_info` with necessary GET - parameters. - """ - try: - sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1) - except RegexMatchError: - sts = "" - # Here we use ``OrderedDict`` so that the output is consistent between - # Python 2.7+. - eurl = f"https://youtube.googleapis.com/v/{video_id}" - params = OrderedDict( - [ - ("video_id", video_id), - ("eurl", eurl), - ("sts", sts), - ("html5", "1"), - ("c", "TVHTML5"), - ("cver", "7.20201028"), - ] - ) - return _video_info_url(params) - - -def _video_info_url(params: OrderedDict) -> str: - return "https://www.youtube.com/get_video_info?" + urlencode(params) - - -def js_url(html: str) -> str: - """Get the base JavaScript url. - - Construct the base JavaScript url, which contains the decipher - "transforms". - - :param str html: - The html contents of the watch page. - """ - try: - base_js = get_ytplayer_config(html)['assets']['js'] - except (KeyError, RegexMatchError): - base_js = get_ytplayer_js(html) - return "https://youtube.com" + base_js - - -def mime_type_codec(mime_type_codec: str) -> Tuple[str, List[str]]: - """Parse the type data. - - Breaks up the data in the ``type`` key of the manifest, which contains the - mime type and codecs serialized together, and splits them into separate - elements. - - **Example**: - - mime_type_codec('audio/webm; codecs="opus"') -> ('audio/webm', ['opus']) - - :param str mime_type_codec: - String containing mime type and codecs. - :rtype: tuple - :returns: - The mime type and a list of codecs. 
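Before the function body below, it may help to see the type-string split on its own; this standalone sketch uses the same regex as the deleted function:

```python
import re


def mime_type_codec(type_str: str):
    # 'video/mp4; codecs="avc1.64001F, mp4a.40.2"'
    #   -> ('video/mp4', ['avc1.64001F', 'mp4a.40.2'])
    match = re.search(r"(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"", type_str)
    if not match:
        raise ValueError(f"unparseable type string: {type_str}")
    mime_type, codecs = match.groups()
    return mime_type, [c.strip() for c in codecs.split(",")]


print(mime_type_codec('video/mp4; codecs="avc1.64001F, mp4a.40.2"'))
```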
- - """ - pattern = r"(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"" - regex = re.compile(pattern) - results = regex.search(mime_type_codec) - if not results: - raise RegexMatchError(caller="mime_type_codec", pattern=pattern) - mime_type, codecs = results.groups() - return mime_type, [c.strip() for c in codecs.split(",")] - - -def get_ytplayer_js(html: str) -> Any: - """Get the YouTube player base JavaScript path. - - :param str html - The html contents of the watch page. - :rtype: str - :returns: - Path to YouTube's base.js file. - """ - js_url_patterns = [ - r"(/s/player/[\w\d]+/[\w\d_/.]+/base\.js)" - ] - for pattern in js_url_patterns: - regex = re.compile(pattern) - function_match = regex.search(html) - if function_match: - logger.debug("finished regex search, matched: %s", pattern) - yt_player_js = function_match.group(1) - return yt_player_js - - raise RegexMatchError( - caller="get_ytplayer_js", pattern="js_url_patterns" - ) - - -def get_ytplayer_config(html: str) -> Any: - """Get the YouTube player configuration data from the watch html. - - Extract the ``ytplayer_config``, which is json data embedded within the - watch html and serves as the primary source of obtaining the stream - manifest data. - - :param str html: - The html contents of the watch page. - :rtype: str - :returns: - Substring of the html containing the encoded manifest data. - """ - logger.debug("finding initial function name") - config_patterns = [ - r"ytplayer\.config\s*=\s*", - r"ytInitialPlayerResponse\s*=\s*" - ] - for pattern in config_patterns: - # Try each pattern consecutively if they don't find a match - try: - return parse_for_object(html, pattern) - except HTMLParseError as e: - logger.debug(f'Pattern failed: {pattern}') - logger.debug(e) - continue - - # setConfig() needs to be handled a little differently. - # We want to parse the entire argument to setConfig() - # and use then load that as json to find PLAYER_CONFIG - # inside of it. - setconfig_patterns = [ - r"yt\.setConfig\(.*['\"]PLAYER_CONFIG['\"]:\s*" - ] - for pattern in setconfig_patterns: - # Try each pattern consecutively if they don't find a match - try: - return parse_for_object(html, pattern) - except HTMLParseError: - continue - - raise RegexMatchError( - caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns" - ) - - -def get_ytcfg(html: str) -> str: - """Get the entirety of the ytcfg object. - - This is built over multiple pieces, so we have to find all matches and - combine the dicts together. - - :param str html: - The html contents of the watch page. - :rtype: str - :returns: - Substring of the html containing the encoded manifest data. - """ - ytcfg = {} - ytcfg_patterns = [ - r"ytcfg\s=\s", - r"ytcfg\.set\(" - ] - for pattern in ytcfg_patterns: - # Try each pattern consecutively and try to build a cohesive object - try: - found_objects = parse_for_all_objects(html, pattern) - for obj in found_objects: - ytcfg.update(obj) - except HTMLParseError: - continue - - if len(ytcfg) > 0: - return ytcfg - - raise RegexMatchError( - caller="get_ytcfg", pattern="ytcfg_pattenrs" - ) - - -def apply_signature(stream_manifest: Dict, vid_info: Dict, js: str) -> None: - """Apply the decrypted signature to the stream manifest. - - :param dict stream_manifest: - Details of the media streams available. - :param str js: - The contents of the base.js asset file. 
- - """ - cipher = Cipher(js=js) - - for i, stream in enumerate(stream_manifest): - try: - url: str = stream["url"] - except KeyError: - live_stream = ( - vid_info.get("playabilityStatus", {},) - .get("liveStreamability") - ) - if live_stream: - raise LiveStreamError("UNKNOWN") - # 403 Forbidden fix. - if "signature" in url or ( - "s" not in stream and ("&sig=" in url or "&lsig=" in url) - ): - # For certain videos, YouTube will just provide them pre-signed, in - # which case there's no real magic to download them and we can skip - # the whole signature descrambling entirely. - logger.debug("signature found, skip decipher") - continue - - signature = cipher.get_signature(ciphered_signature=stream["s"]) - - logger.debug( - "finished descrambling signature for itag=%s", stream["itag"] - ) - parsed_url = urlparse(url) - - # Convert query params off url to dict - query_params = parse_qs(urlparse(url).query) - query_params = { - k: v[0] for k,v in query_params.items() - } - query_params['sig'] = signature - if 'ratebypass' not in query_params.keys(): - # Cipher n to get the updated value - - initial_n = list(query_params['n']) - new_n = cipher.calculate_n(initial_n) - query_params['n'] = new_n - - url = f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}?{urlencode(query_params)}' # noqa:E501 - - # 403 forbidden fix - stream_manifest[i]["url"] = url - - -def apply_descrambler(stream_data: Dict) -> None: - """Apply various in-place transforms to YouTube's media stream data. - - Creates a ``list`` of dictionaries by string splitting on commas, then - taking each list item, parsing it as a query string, converting it to a - ``dict`` and unquoting the value. - - :param dict stream_data: - Dictionary containing query string encoded values. - - **Example**: - - >>> d = {'foo': 'bar=1&var=test,em=5&t=url%20encoded'} - >>> apply_descrambler(d, 'foo') - >>> print(d) - {'foo': [{'bar': '1', 'var': 'test'}, {'em': '5', 't': 'url encoded'}]} - - """ - if 'url' in stream_data: - return None - - # Merge formats and adaptiveFormats into a single list - formats = [] - if 'formats' in stream_data.keys(): - formats.extend(stream_data['formats']) - if 'adaptiveFormats' in stream_data.keys(): - formats.extend(stream_data['adaptiveFormats']) - - # Extract url and s from signatureCiphers as necessary - for data in formats: - if 'url' not in data: - if 'signatureCipher' in data: - cipher_url = parse_qs(data['signatureCipher']) - data['url'] = cipher_url['url'][0] - data['s'] = cipher_url['s'][0] - data['is_otf'] = data.get('type') == 'FORMAT_STREAM_TYPE_OTF' - - logger.debug("applying descrambler") - return formats - - -def initial_data(watch_html: str) -> str: - """Extract the ytInitialData json from the watch_html page. - - This mostly contains metadata necessary for rendering the page on-load, - such as video information, copyright notices, etc. - - @param watch_html: Html of the watch page - @return: - """ - patterns = [ - r"window\[['\"]ytInitialData['\"]]\s*=\s*", - r"ytInitialData\s*=\s*" - ] - for pattern in patterns: - try: - return parse_for_object(watch_html, pattern) - except HTMLParseError: - pass - - raise RegexMatchError(caller='initial_data', pattern='initial_data_pattern') - - -def initial_player_response(watch_html: str) -> str: - """Extract the ytInitialPlayerResponse json from the watch_html page. - - This mostly contains metadata necessary for rendering the page on-load, - such as video information, copyright notices, etc. 
- - @param watch_html: Html of the watch page - @return: - """ - patterns = [ - r"window\[['\"]ytInitialPlayerResponse['\"]]\s*=\s*", - r"ytInitialPlayerResponse\s*=\s*" - ] - for pattern in patterns: - try: - return parse_for_object(watch_html, pattern) - except HTMLParseError: - pass - - raise RegexMatchError( - caller='initial_player_response', - pattern='initial_player_response_pattern' - ) - - -def metadata(initial_data) -> Optional[YouTubeMetadata]: - """Get the informational metadata for the video. - - e.g.: - [ - { - 'Song': '강남스타일(Gangnam Style)', - 'Artist': 'PSY', - 'Album': 'PSY SIX RULES Pt.1', - 'Licensed to YouTube by': 'YG Entertainment Inc. [...]' - } - ] - - :rtype: YouTubeMetadata - """ - try: - metadata_rows: List = initial_data["contents"]["twoColumnWatchNextResults"][ - "results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"][ - "metadataRowContainer"]["metadataRowContainerRenderer"]["rows"] - except (KeyError, IndexError): - # If there's an exception accessing this data, it probably doesn't exist. - return YouTubeMetadata([]) - - # Rows appear to only have "metadataRowRenderer" or "metadataRowHeaderRenderer" - # and we only care about the former, so we filter the others - metadata_rows = filter( - lambda x: "metadataRowRenderer" in x.keys(), - metadata_rows - ) - - # We then access the metadataRowRenderer key in each element - # and build a metadata object from this new list - metadata_rows = [x["metadataRowRenderer"] for x in metadata_rows] - - return YouTubeMetadata(metadata_rows) diff --git a/001-Downloader/pytube/helpers.py b/001-Downloader/pytube/helpers.py deleted file mode 100755 index 4cf02eb..0000000 --- a/001-Downloader/pytube/helpers.py +++ /dev/null @@ -1,335 +0,0 @@ -"""Various helper functions implemented by pytube.""" -import functools -import gzip -import json -import logging -import os -import re -import warnings -from typing import Any, Callable, Dict, List, Optional, TypeVar -from urllib import request - -from pytube.exceptions import RegexMatchError - -logger = logging.getLogger(__name__) - - -class DeferredGeneratorList: - """A wrapper class for deferring list generation. - - Pytube has some continuation generators that create web calls, which means - that any time a full list is requested, all of those web calls must be - made at once, which could lead to slowdowns. This will allow individual - elements to be queried, so that slowdowns only happen as necessary. For - example, you can iterate over elements in the list without accessing them - all simultaneously. This should allow for speed improvements for playlist - and channel interactions. - """ - def __init__(self, generator): - """Construct a :class:`DeferredGeneratorList `. - - :param generator generator: - The deferrable generator to create a wrapper for. - :param func func: - (Optional) A function to call on the generator items to produce the list. - """ - self.gen = generator - self._elements = [] - - def __eq__(self, other): - """We want to mimic list behavior for comparison.""" - return list(self) == other - - def __getitem__(self, key) -> Any: - """Only generate items as they're asked for.""" - # We only allow querying with indexes. 
- if not isinstance(key, (int, slice)): - raise TypeError('Key must be either a slice or int.') - - # Convert int keys to slice - key_slice = key - if isinstance(key, int): - key_slice = slice(key, key + 1, 1) - - # Generate all elements up to the final item - while len(self._elements) < key_slice.stop: - try: - next_item = next(self.gen) - except StopIteration: - # If we can't find enough elements for the slice, raise an IndexError - raise IndexError - else: - self._elements.append(next_item) - - return self._elements[key] - - def __iter__(self): - """Custom iterator for dynamically generated list.""" - iter_index = 0 - while True: - try: - curr_item = self[iter_index] - except IndexError: - return - else: - yield curr_item - iter_index += 1 - - def __next__(self) -> Any: - """Fetch next element in iterator.""" - try: - curr_element = self[self.iter_index] - except IndexError: - raise StopIteration - self.iter_index += 1 - return curr_element # noqa:R504 - - def __len__(self) -> int: - """Return length of list of all items.""" - self.generate_all() - return len(self._elements) - - def __repr__(self) -> str: - """String representation of all items.""" - self.generate_all() - return str(self._elements) - - def __reversed__(self): - self.generate_all() - return self._elements[::-1] - - def generate_all(self): - """Generate all items.""" - while True: - try: - next_item = next(self.gen) - except StopIteration: - break - else: - self._elements.append(next_item) - - -def regex_search(pattern: str, string: str, group: int) -> str: - """Shortcut method to search a string for a given pattern. - - :param str pattern: - A regular expression pattern. - :param str string: - A target string to search. - :param int group: - Index of group to return. - :rtype: - str or tuple - :returns: - Substring pattern matches. - """ - regex = re.compile(pattern) - results = regex.search(string) - if not results: - raise RegexMatchError(caller="regex_search", pattern=pattern) - - logger.debug("matched regex search: %s", pattern) - - return results.group(group) - - -def safe_filename(s: str, max_length: int = 255) -> str: - """Sanitize a string making it safe to use as a filename. - - This function was based off the limitations outlined here: - https://en.wikipedia.org/wiki/Filename. - - :param str s: - A string to make safe for use as a file name. - :param int max_length: - The maximum filename character length. - :rtype: str - :returns: - A sanitized string. - """ - # Characters in range 0-31 (0x00-0x1F) are not allowed in ntfs filenames. - ntfs_characters = [chr(i) for i in range(0, 31)] - characters = [ - r'"', - r"\#", - r"\$", - r"\%", - r"'", - r"\*", - r"\,", - r"\.", - r"\/", - r"\:", - r'"', - r"\;", - r"\<", - r"\>", - r"\?", - r"\\", - r"\^", - r"\|", - r"\~", - r"\\\\", - ] - pattern = "|".join(ntfs_characters + characters) - regex = re.compile(pattern, re.UNICODE) - filename = regex.sub("", s) - return filename[:max_length].rsplit(" ", 0)[0] - - -def setup_logger(level: int = logging.ERROR, log_filename: Optional[str] = None) -> None: - """Create a configured instance of logger. - - :param int level: - Describe the severity level of the logs to handle. 
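The deferred-list machinery above can be reduced to its core: pull from the generator only as far as the requested index, caching items along the way. A stripped-down version (the class name here is illustrative):

```python
class LazyList:
    def __init__(self, gen):
        self.gen = gen
        self._elements = []

    def __getitem__(self, index: int):
        # Generate just enough items to satisfy the index, then cache them.
        while len(self._elements) <= index:
            try:
                self._elements.append(next(self.gen))
            except StopIteration:
                raise IndexError(index)
        return self._elements[index]


lazy = LazyList(iter(range(100)))
print(lazy[3])  # only items 0..3 have been generated at this point
```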
- """ - fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s" - date_fmt = "%H:%M:%S" - formatter = logging.Formatter(fmt, datefmt=date_fmt) - - # https://github.com/pytube/pytube/issues/163 - logger = logging.getLogger("pytube") - logger.setLevel(level) - - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(formatter) - logger.addHandler(stream_handler) - - if log_filename is not None: - file_handler = logging.FileHandler(log_filename) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - - -GenericType = TypeVar("GenericType") - - -def cache(func: Callable[..., GenericType]) -> GenericType: - """ mypy compatible annotation wrapper for lru_cache""" - return functools.lru_cache()(func) # type: ignore - - -def deprecated(reason: str) -> Callable: - """ - This is a decorator which can be used to mark functions - as deprecated. It will result in a warning being emitted - when the function is used. - """ - - def decorator(func1): - message = "Call to deprecated function {name} ({reason})." - - @functools.wraps(func1) - def new_func1(*args, **kwargs): - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - message.format(name=func1.__name__, reason=reason), - category=DeprecationWarning, - stacklevel=2, - ) - warnings.simplefilter("default", DeprecationWarning) - return func1(*args, **kwargs) - - return new_func1 - - return decorator - - -def target_directory(output_path: Optional[str] = None) -> str: - """ - Function for determining target directory of a download. - Returns an absolute path (if relative one given) or the current - path (if none given). Makes directory if it does not exist. - - :type output_path: str - :rtype: str - :returns: - An absolute directory path as a string. - """ - if output_path: - if not os.path.isabs(output_path): - output_path = os.path.join(os.getcwd(), output_path) - else: - output_path = os.getcwd() - os.makedirs(output_path, exist_ok=True) - return output_path - - -def install_proxy(proxy_handler: Dict[str, str]) -> None: - proxy_support = request.ProxyHandler(proxy_handler) - opener = request.build_opener(proxy_support) - request.install_opener(opener) - - -def uniqueify(duped_list: List) -> List: - """Remove duplicate items from a list, while maintaining list order. - - :param List duped_list - List to remove duplicates from - - :return List result - De-duplicated list - """ - seen: Dict[Any, bool] = {} - result = [] - for item in duped_list: - if item in seen: - continue - seen[item] = True - result.append(item) - return result - - -def generate_all_html_json_mocks(): - """Regenerate the video mock json files for all current test videos. - - This should automatically output to the test/mocks directory. - """ - test_vid_ids = [ - '2lAe1cqCOXo', - '5YceQ8YqYMc', - 'irauhITDrsE', - 'm8uHb5jIGN8', - 'QRS8MkLhQmM', - 'WXxV9g7lsFE' - ] - for vid_id in test_vid_ids: - create_mock_html_json(vid_id) - - -def create_mock_html_json(vid_id) -> Dict[str, Any]: - """Generate a json.gz file with sample html responses. 
- - :param str vid_id - YouTube video id - - :return dict data - Dict used to generate the json.gz file - """ - from pytube import YouTube - gzip_filename = 'yt-video-%s-html.json.gz' % vid_id - - # Get the pytube directory in order to navigate to /tests/mocks - pytube_dir_path = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - os.path.pardir - ) - ) - pytube_mocks_path = os.path.join(pytube_dir_path, 'tests', 'mocks') - gzip_filepath = os.path.join(pytube_mocks_path, gzip_filename) - - yt = YouTube(f'https://www.youtube.com/watch?v={vid_id}') - html_data = { - 'url': yt.watch_url, - 'js': yt.js, - 'embed_html': yt.embed_html, - 'watch_html': yt.watch_html, - 'vid_info': yt.vid_info - } - - logger.info(f'Outputing json.gz file to {gzip_filepath}') - with gzip.open(gzip_filepath, 'wb') as f: - f.write(json.dumps(html_data).encode('utf-8')) - - return html_data diff --git a/001-Downloader/pytube/innertube.py b/001-Downloader/pytube/innertube.py deleted file mode 100755 index c5d940a..0000000 --- a/001-Downloader/pytube/innertube.py +++ /dev/null @@ -1,359 +0,0 @@ -"""This module is designed to interact with the innertube API. - -This module is NOT intended to be used directly by end users, as each of the -interfaces returns raw results. These should instead be parsed to extract -the useful information for the end user. -""" -# Native python imports -import json -import os -import pathlib -import time -from urllib import parse - -# Local imports -from pytube import request - -# YouTube on TV client secrets -_client_id = '861556708454-d6dlm3lh05idd8npek18k6be8ba3oc68.apps.googleusercontent.com' -_client_secret = 'SboVhoG9s0rNafixCSGGKXAT' - -# Extracted API keys -- unclear what these are linked to. -_api_keys = [ - 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', - 'AIzaSyCtkvNIR1HCEwzsqK6JuE6KqpyjusIRI30', - 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', - 'AIzaSyC8UYZpvA2eknNex0Pjid0_eTLJoDu6los', - 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', - 'AIzaSyDHQ9ipnphqTzDqZsbtd8_Ru4_kiKVQe2k' -] - -_default_clients = { - 'WEB': { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20200720.00.02' - } - }, - 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' - }, - 'ANDROID': { - 'context': { - 'client': { - 'clientName': 'ANDROID', - 'clientVersion': '16.20' - } - }, - 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' - }, - 'WEB_EMBED': { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20210721.00.00', - 'clientScreen': 'EMBED' - } - }, - 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' - }, - 'ANDROID_EMBED': { - 'context': { - 'client': { - 'clientName': 'ANDROID', - 'clientVersion': '16.20', - 'clientScreen': 'EMBED' - } - }, - 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' - } -} -_token_timeout = 1800 -_cache_dir = pathlib.Path(__file__).parent.resolve() / '__cache__' -_token_file = os.path.join(_cache_dir, 'tokens.json') - - -class InnerTube: - """Object for interacting with the innertube API.""" - def __init__(self, client='ANDROID', use_oauth=False, allow_cache=True): - """Initialize an InnerTube object. - - :param str client: - Client to use for the object. - Default to web because it returns the most playback types. - :param bool use_oauth: - Whether or not to authenticate to YouTube. - :param bool allow_cache: - Allows caching of oauth tokens on the machine. 
- """ - self.context = _default_clients[client]['context'] - self.api_key = _default_clients[client]['api_key'] - self.access_token = None - self.refresh_token = None - self.use_oauth = use_oauth - self.allow_cache = allow_cache - - # Stored as epoch time - self.expires = None - - # Try to load from file if specified - if self.use_oauth and self.allow_cache: - # Try to load from file if possible - if os.path.exists(_token_file): - with open(_token_file) as f: - data = json.load(f) - self.access_token = data['access_token'] - self.refresh_token = data['refresh_token'] - self.expires = data['expires'] - self.refresh_bearer_token() - - def cache_tokens(self): - """Cache tokens to file if allowed.""" - if not self.allow_cache: - return - - data = { - 'access_token': self.access_token, - 'refresh_token': self.refresh_token, - 'expires': self.expires - } - if not os.path.exists(_cache_dir): - os.mkdir(_cache_dir) - with open(_token_file, 'w') as f: - json.dump(data, f) - - def refresh_bearer_token(self, force=False): - """Refreshes the OAuth token if necessary. - - :param bool force: - Force-refresh the bearer token. - """ - if not self.use_oauth: - return - # Skip refresh if it's not necessary and not forced - if self.expires > time.time() and not force: - return - - # Subtracting 30 seconds is arbitrary to avoid potential time discrepencies - start_time = int(time.time() - 30) - data = { - 'client_id': _client_id, - 'client_secret': _client_secret, - 'grant_type': 'refresh_token', - 'refresh_token': self.refresh_token - } - response = request._execute_request( - 'https://oauth2.googleapis.com/token', - 'POST', - headers={ - 'Content-Type': 'application/json' - }, - data=data - ) - response_data = json.loads(response.read()) - - self.access_token = response_data['access_token'] - self.expires = start_time + response_data['expires_in'] - self.cache_tokens() - - def fetch_bearer_token(self): - """Fetch an OAuth token.""" - # Subtracting 30 seconds is arbitrary to avoid potential time discrepencies - start_time = int(time.time() - 30) - data = { - 'client_id': _client_id, - 'scope': 'https://www.googleapis.com/auth/youtube' - } - response = request._execute_request( - 'https://oauth2.googleapis.com/device/code', - 'POST', - headers={ - 'Content-Type': 'application/json' - }, - data=data - ) - response_data = json.loads(response.read()) - verification_url = response_data['verification_url'] - user_code = response_data['user_code'] - print(f'Please open {verification_url} and input code {user_code}') - input('Press enter when you have completed this step.') - - data = { - 'client_id': _client_id, - 'client_secret': _client_secret, - 'device_code': response_data['device_code'], - 'grant_type': 'urn:ietf:params:oauth:grant-type:device_code' - } - response = request._execute_request( - 'https://oauth2.googleapis.com/token', - 'POST', - headers={ - 'Content-Type': 'application/json' - }, - data=data - ) - response_data = json.loads(response.read()) - - self.access_token = response_data['access_token'] - self.refresh_token = response_data['refresh_token'] - self.expires = start_time + response_data['expires_in'] - self.cache_tokens() - - @property - def base_url(self): - """Return the base url endpoint for the innertube API.""" - return 'https://www.youtube.com/youtubei/v1' - - @property - def base_data(self): - """Return the base json data to transmit to the innertube API.""" - return { - 'context': self.context - } - - @property - def base_params(self): - """Return the base query parameters to transmit 
to the innertube API.""" - return { - 'key': self.api_key, - 'contentCheckOk': True, - 'racyCheckOk': True - } - - def _call_api(self, endpoint, query, data): - """Make a request to a given endpoint with the provided query parameters and data.""" - # Remove the API key if oauth is being used. - if self.use_oauth: - del query['key'] - - endpoint_url = f'{endpoint}?{parse.urlencode(query)}' - headers = { - 'Content-Type': 'application/json', - } - # Add the bearer token if applicable - if self.use_oauth: - if self.access_token: - self.refresh_bearer_token() - headers['Authorization'] = f'Bearer {self.access_token}' - else: - self.fetch_bearer_token() - headers['Authorization'] = f'Bearer {self.access_token}' - - response = request._execute_request( - endpoint_url, - 'POST', - headers=headers, - data=data - ) - return json.loads(response.read()) - - def browse(self): - """Make a request to the browse endpoint. - - TODO: Figure out how we can use this - """ - # endpoint = f'{self.base_url}/browse' # noqa:E800 - ... - # return self._call_api(endpoint, query, self.base_data) # noqa:E800 - - def config(self): - """Make a request to the config endpoint. - - TODO: Figure out how we can use this - """ - # endpoint = f'{self.base_url}/config' # noqa:E800 - ... - # return self._call_api(endpoint, query, self.base_data) # noqa:E800 - - def guide(self): - """Make a request to the guide endpoint. - - TODO: Figure out how we can use this - """ - # endpoint = f'{self.base_url}/guide' # noqa:E800 - ... - # return self._call_api(endpoint, query, self.base_data) # noqa:E800 - - def next(self): - """Make a request to the next endpoint. - - TODO: Figure out how we can use this - """ - # endpoint = f'{self.base_url}/next' # noqa:E800 - ... - # return self._call_api(endpoint, query, self.base_data) # noqa:E800 - - def player(self, video_id): - """Make a request to the player endpoint. - - :param str video_id: - The video id to get player info for. - :rtype: dict - :returns: - Raw player info results. - """ - endpoint = f'{self.base_url}/player' - query = { - 'videoId': video_id, - } - query.update(self.base_params) - return self._call_api(endpoint, query, self.base_data) - - def search(self, search_query, continuation=None): - """Make a request to the search endpoint. - - :param str search_query: - The query to search. - :rtype: dict - :returns: - Raw search query results. - """ - endpoint = f'{self.base_url}/search' - query = { - 'query': search_query - } - query.update(self.base_params) - data = {} - if continuation: - data['continuation'] = continuation - data.update(self.base_data) - return self._call_api(endpoint, query, data) - - def verify_age(self, video_id): - """Make a request to the age_verify endpoint. - - Notable examples of the types of video this verification step is for: - * https://www.youtube.com/watch?v=QLdAhwSBZ3w - * https://www.youtube.com/watch?v=hc0ZDaAZQT0 - - :param str video_id: - The video id to get player info for. - :rtype: dict - :returns: - Returns information that includes a URL for bypassing certain restrictions. - """ - endpoint = f'{self.base_url}/verify_age' - data = { - 'nextEndpoint': { - 'urlEndpoint': { - 'url': f'/watch?v={video_id}' - } - }, - 'setControvercy': True - } - data.update(self.base_data) - result = self._call_api(endpoint, self.base_params, data) - return result - - def get_transcript(self, video_id): - """Make a request to the get_transcript endpoint. - - This is likely related to captioning for videos, but is currently untested. 
- """ - endpoint = f'{self.base_url}/get_transcript' - query = { - 'videoId': video_id, - } - query.update(self.base_params) - result = self._call_api(endpoint, query, self.base_data) - return result diff --git a/001-Downloader/pytube/itags.py b/001-Downloader/pytube/itags.py deleted file mode 100755 index 2f23cae..0000000 --- a/001-Downloader/pytube/itags.py +++ /dev/null @@ -1,144 +0,0 @@ -"""This module contains a lookup table of YouTube's itag values.""" -from typing import Dict - -PROGRESSIVE_VIDEO = { - 5: ("240p", "64kbps"), - 6: ("270p", "64kbps"), - 13: ("144p", None), - 17: ("144p", "24kbps"), - 18: ("360p", "96kbps"), - 22: ("720p", "192kbps"), - 34: ("360p", "128kbps"), - 35: ("480p", "128kbps"), - 36: ("240p", None), - 37: ("1080p", "192kbps"), - 38: ("3072p", "192kbps"), - 43: ("360p", "128kbps"), - 44: ("480p", "128kbps"), - 45: ("720p", "192kbps"), - 46: ("1080p", "192kbps"), - 59: ("480p", "128kbps"), - 78: ("480p", "128kbps"), - 82: ("360p", "128kbps"), - 83: ("480p", "128kbps"), - 84: ("720p", "192kbps"), - 85: ("1080p", "192kbps"), - 91: ("144p", "48kbps"), - 92: ("240p", "48kbps"), - 93: ("360p", "128kbps"), - 94: ("480p", "128kbps"), - 95: ("720p", "256kbps"), - 96: ("1080p", "256kbps"), - 100: ("360p", "128kbps"), - 101: ("480p", "192kbps"), - 102: ("720p", "192kbps"), - 132: ("240p", "48kbps"), - 151: ("720p", "24kbps"), - 300: ("720p", "128kbps"), - 301: ("1080p", "128kbps"), -} - -DASH_VIDEO = { - # DASH Video - 133: ("240p", None), # MP4 - 134: ("360p", None), # MP4 - 135: ("480p", None), # MP4 - 136: ("720p", None), # MP4 - 137: ("1080p", None), # MP4 - 138: ("2160p", None), # MP4 - 160: ("144p", None), # MP4 - 167: ("360p", None), # WEBM - 168: ("480p", None), # WEBM - 169: ("720p", None), # WEBM - 170: ("1080p", None), # WEBM - 212: ("480p", None), # MP4 - 218: ("480p", None), # WEBM - 219: ("480p", None), # WEBM - 242: ("240p", None), # WEBM - 243: ("360p", None), # WEBM - 244: ("480p", None), # WEBM - 245: ("480p", None), # WEBM - 246: ("480p", None), # WEBM - 247: ("720p", None), # WEBM - 248: ("1080p", None), # WEBM - 264: ("1440p", None), # MP4 - 266: ("2160p", None), # MP4 - 271: ("1440p", None), # WEBM - 272: ("4320p", None), # WEBM - 278: ("144p", None), # WEBM - 298: ("720p", None), # MP4 - 299: ("1080p", None), # MP4 - 302: ("720p", None), # WEBM - 303: ("1080p", None), # WEBM - 308: ("1440p", None), # WEBM - 313: ("2160p", None), # WEBM - 315: ("2160p", None), # WEBM - 330: ("144p", None), # WEBM - 331: ("240p", None), # WEBM - 332: ("360p", None), # WEBM - 333: ("480p", None), # WEBM - 334: ("720p", None), # WEBM - 335: ("1080p", None), # WEBM - 336: ("1440p", None), # WEBM - 337: ("2160p", None), # WEBM - 394: ("144p", None), # MP4 - 395: ("240p", None), # MP4 - 396: ("360p", None), # MP4 - 397: ("480p", None), # MP4 - 398: ("720p", None), # MP4 - 399: ("1080p", None), # MP4 - 400: ("1440p", None), # MP4 - 401: ("2160p", None), # MP4 - 402: ("4320p", None), # MP4 - 571: ("4320p", None), # MP4 -} - -DASH_AUDIO = { - # DASH Audio - 139: (None, "48kbps"), # MP4 - 140: (None, "128kbps"), # MP4 - 141: (None, "256kbps"), # MP4 - 171: (None, "128kbps"), # WEBM - 172: (None, "256kbps"), # WEBM - 249: (None, "50kbps"), # WEBM - 250: (None, "70kbps"), # WEBM - 251: (None, "160kbps"), # WEBM - 256: (None, "192kbps"), # MP4 - 258: (None, "384kbps"), # MP4 - 325: (None, None), # MP4 - 328: (None, None), # MP4 -} - -ITAGS = { - **PROGRESSIVE_VIDEO, - **DASH_VIDEO, - **DASH_AUDIO, -} - -HDR = [330, 331, 332, 333, 334, 335, 336, 337] -_3D = [82, 83, 84, 85, 100, 
101, 102] -LIVE = [91, 92, 93, 94, 95, 96, 132, 151] - - -def get_format_profile(itag: int) -> Dict: - """Get additional format information for a given itag. - - :param str itag: - YouTube format identifier code. - """ - itag = int(itag) - if itag in ITAGS: - res, bitrate = ITAGS[itag] - else: - res, bitrate = None, None - return { - "resolution": res, - "abr": bitrate, - "is_live": itag in LIVE, - "is_3d": itag in _3D, - "is_hdr": itag in HDR, - "is_dash": ( - itag in DASH_AUDIO - or itag in DASH_VIDEO - ), - } diff --git a/001-Downloader/pytube/metadata.py b/001-Downloader/pytube/metadata.py deleted file mode 100755 index be12c63..0000000 --- a/001-Downloader/pytube/metadata.py +++ /dev/null @@ -1,48 +0,0 @@ -"""This module contains the YouTubeMetadata class.""" -import json -from typing import Dict, List, Optional - - -class YouTubeMetadata: - def __init__(self, metadata: List): - self._raw_metadata: List = metadata - self._metadata = [{}] - - for el in metadata: - # We only add metadata to the dict if it has a simpleText title. - if 'title' in el and 'simpleText' in el['title']: - metadata_title = el['title']['simpleText'] - else: - continue - - contents = el['contents'][0] - if 'simpleText' in contents: - self._metadata[-1][metadata_title] = contents['simpleText'] - elif 'runs' in contents: - self._metadata[-1][metadata_title] = contents['runs'][0]['text'] - - # Upon reaching a dividing line, create a new grouping - if el.get('hasDividerLine', False): - self._metadata.append({}) - - # If we happen to create an empty dict at the end, drop it - if self._metadata[-1] == {}: - self._metadata = self._metadata[:-1] - - def __getitem__(self, key): - return self._metadata[key] - - def __iter__(self): - for el in self._metadata: - yield el - - def __str__(self): - return json.dumps(self._metadata) - - @property - def raw_metadata(self) -> Optional[Dict]: - return self._raw_metadata - - @property - def metadata(self): - return self._metadata diff --git a/001-Downloader/pytube/monostate.py b/001-Downloader/pytube/monostate.py deleted file mode 100755 index 7968af5..0000000 --- a/001-Downloader/pytube/monostate.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Any, Callable, Optional - - -class Monostate: - def __init__( - self, - on_progress: Optional[Callable[[Any, bytes, int], None]], - on_complete: Optional[Callable[[Any, Optional[str]], None]], - title: Optional[str] = None, - duration: Optional[int] = None, - ): - self.on_progress = on_progress - self.on_complete = on_complete - self.title = title - self.duration = duration diff --git a/001-Downloader/pytube/parser.py b/001-Downloader/pytube/parser.py deleted file mode 100755 index 8edea35..0000000 --- a/001-Downloader/pytube/parser.py +++ /dev/null @@ -1,178 +0,0 @@ -import ast -import json -import re -from pytube.exceptions import HTMLParseError - - -def parse_for_all_objects(html, preceding_regex): - """Parses input html to find all matches for the input starting point. - - :param str html: - HTML to be parsed for an object. - :param str preceding_regex: - Regex to find the string preceding the object. - :rtype list: - :returns: - A list of dicts created from parsing the objects. - """ - result = [] - regex = re.compile(preceding_regex) - match_iter = regex.finditer(html) - for match in match_iter: - if match: - start_index = match.end() - try: - obj = parse_for_object_from_startpoint(html, start_index) - except HTMLParseError: - # Some of the instances might fail because set is technically - # a method of the ytcfg object. 
We'll skip these since they - # don't seem relevant at the moment. - continue - else: - result.append(obj) - - if len(result) == 0: - raise HTMLParseError(f'No matches for regex {preceding_regex}') - - return result - - -def parse_for_object(html, preceding_regex): - """Parses input html to find the end of a JavaScript object. - - :param str html: - HTML to be parsed for an object. - :param str preceding_regex: - Regex to find the string preceding the object. - :rtype dict: - :returns: - A dict created from parsing the object. - """ - regex = re.compile(preceding_regex) - result = regex.search(html) - if not result: - raise HTMLParseError(f'No matches for regex {preceding_regex}') - - start_index = result.end() - return parse_for_object_from_startpoint(html, start_index) - - -def find_object_from_startpoint(html, start_point): - """Parses input html to find the end of a JavaScript object. - - :param str html: - HTML to be parsed for an object. - :param int start_point: - Index of where the object starts. - :rtype dict: - :returns: - A dict created from parsing the object. - """ - html = html[start_point:] - if html[0] not in ['{','[']: - raise HTMLParseError(f'Invalid start point. Start of HTML:\n{html[:20]}') - - # First letter MUST be a open brace, so we put that in the stack, - # and skip the first character. - stack = [html[0]] - i = 1 - - context_closers = { - '{': '}', - '[': ']', - '"': '"' - } - - while i < len(html): - if len(stack) == 0: - break - curr_char = html[i] - curr_context = stack[-1] - - # If we've reached a context closer, we can remove an element off the stack - if curr_char == context_closers[curr_context]: - stack.pop() - i += 1 - continue - - # Strings require special context handling because they can contain - # context openers *and* closers - if curr_context == '"': - # If there's a backslash in a string, we skip a character - if curr_char == '\\': - i += 2 - continue - else: - # Non-string contexts are when we need to look for context openers. - if curr_char in context_closers.keys(): - stack.append(curr_char) - - i += 1 - - full_obj = html[:i] - return full_obj # noqa: R504 - - -def parse_for_object_from_startpoint(html, start_point): - """JSONifies an object parsed from HTML. - - :param str html: - HTML to be parsed for an object. - :param int start_point: - Index of where the object starts. - :rtype dict: - :returns: - A dict created from parsing the object. - """ - full_obj = find_object_from_startpoint(html, start_point) - try: - return json.loads(full_obj) - except json.decoder.JSONDecodeError: - try: - return ast.literal_eval(full_obj) - except (ValueError, SyntaxError): - raise HTMLParseError('Could not parse object.') - - -def throttling_array_split(js_array): - """Parses the throttling array into a python list of strings. - - Expects input to begin with `[` and close with `]`. - - :param str js_array: - The javascript array, as a string. - :rtype: list: - :returns: - A list of strings representing splits on `,` in the throttling array. - """ - results = [] - curr_substring = js_array[1:] - - comma_regex = re.compile(r",") - func_regex = re.compile(r"function\([^)]+\)") - - while len(curr_substring) > 0: - if curr_substring.startswith('function'): - # Handle functions separately. 
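The brace matcher above is easy to sanity-check on a toy snippet: `parse_for_object` locates the prefix with a regex, then the stack walk tracks `{`, `[`, and `"` contexts so that braces inside strings do not confuse it:

```python
# Toy check of the object parser above: the brace inside the string is ignored.
js = 'var ytcfg = {"a": [1, 2, {"b": "c}"}], "d": "e"}; more code'
obj = parse_for_object(js, r'var ytcfg\s*=\s*')
print(obj['a'][2]['b'])  # -> 'c}'
```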
These can contain commas - match = func_regex.search(curr_substring) - match_start, match_end = match.span() - - function_text = find_object_from_startpoint(curr_substring, match.span()[1]) - full_function_def = curr_substring[:match_end + len(function_text)] - results.append(full_function_def) - curr_substring = curr_substring[len(full_function_def) + 1:] - else: - match = comma_regex.search(curr_substring) - - # Try-catch to capture end of array - try: - match_start, match_end = match.span() - except AttributeError: - match_start = len(curr_substring) - 1 - match_end = match_start + 1 - - curr_el = curr_substring[:match_start] - results.append(curr_el) - curr_substring = curr_substring[match_end:] - - return results diff --git a/001-Downloader/pytube/query.py b/001-Downloader/pytube/query.py deleted file mode 100755 index d4878ba..0000000 --- a/001-Downloader/pytube/query.py +++ /dev/null @@ -1,421 +0,0 @@ -"""This module provides a query interface for media streams and captions.""" -from collections.abc import Mapping, Sequence -from typing import Callable, List, Optional, Union - -from pytube import Caption, Stream -from pytube.helpers import deprecated - - -class StreamQuery(Sequence): - """Interface for querying the available media streams.""" - - def __init__(self, fmt_streams): - """Construct a :class:`StreamQuery `. - - param list fmt_streams: - list of :class:`Stream ` instances. - """ - self.fmt_streams = fmt_streams - self.itag_index = {int(s.itag): s for s in fmt_streams} - - def filter( - self, - fps=None, - res=None, - resolution=None, - mime_type=None, - type=None, - subtype=None, - file_extension=None, - abr=None, - bitrate=None, - video_codec=None, - audio_codec=None, - only_audio=None, - only_video=None, - progressive=None, - adaptive=None, - is_dash=None, - custom_filter_functions=None, - ): - """Apply the given filtering criterion. - - :param fps: - (optional) The frames per second. - :type fps: - int or None - - :param resolution: - (optional) Alias to ``res``. - :type res: - str or None - - :param res: - (optional) The video resolution. - :type resolution: - str or None - - :param mime_type: - (optional) Two-part identifier for file formats and format contents - composed of a "type", a "subtype". - :type mime_type: - str or None - - :param type: - (optional) Type part of the ``mime_type`` (e.g.: audio, video). - :type type: - str or None - - :param subtype: - (optional) Sub-type part of the ``mime_type`` (e.g.: mp4, mov). - :type subtype: - str or None - - :param file_extension: - (optional) Alias to ``sub_type``. - :type file_extension: - str or None - - :param abr: - (optional) Average bitrate (ABR) refers to the average amount of - data transferred per unit of time (e.g.: 64kbps, 192kbps). - :type abr: - str or None - - :param bitrate: - (optional) Alias to ``abr``. - :type bitrate: - str or None - - :param video_codec: - (optional) Video compression format. - :type video_codec: - str or None - - :param audio_codec: - (optional) Audio compression format. - :type audio_codec: - str or None - - :param bool progressive: - Excludes adaptive streams (one file contains both audio and video - tracks). - - :param bool adaptive: - Excludes progressive streams (audio and video are on separate - tracks). - - :param bool is_dash: - Include/exclude dash streams. - - :param bool only_audio: - Excludes streams with video tracks. - - :param bool only_video: - Excludes streams with audio tracks. 
- - :param custom_filter_functions: - (optional) Interface for defining complex filters without - subclassing. - :type custom_filter_functions: - list or None - - """ - filters = [] - if res or resolution: - filters.append(lambda s: s.resolution == (res or resolution)) - - if fps: - filters.append(lambda s: s.fps == fps) - - if mime_type: - filters.append(lambda s: s.mime_type == mime_type) - - if type: - filters.append(lambda s: s.type == type) - - if subtype or file_extension: - filters.append(lambda s: s.subtype == (subtype or file_extension)) - - if abr or bitrate: - filters.append(lambda s: s.abr == (abr or bitrate)) - - if video_codec: - filters.append(lambda s: s.video_codec == video_codec) - - if audio_codec: - filters.append(lambda s: s.audio_codec == audio_codec) - - if only_audio: - filters.append( - lambda s: ( - s.includes_audio_track and not s.includes_video_track - ), - ) - - if only_video: - filters.append( - lambda s: ( - s.includes_video_track and not s.includes_audio_track - ), - ) - - if progressive: - filters.append(lambda s: s.is_progressive) - - if adaptive: - filters.append(lambda s: s.is_adaptive) - - if custom_filter_functions: - filters.extend(custom_filter_functions) - - if is_dash is not None: - filters.append(lambda s: s.is_dash == is_dash) - - return self._filter(filters) - - def _filter(self, filters: List[Callable]) -> "StreamQuery": - fmt_streams = self.fmt_streams - for filter_lambda in filters: - fmt_streams = filter(filter_lambda, fmt_streams) - return StreamQuery(list(fmt_streams)) - - def order_by(self, attribute_name: str) -> "StreamQuery": - """Apply a sort order. Filters out stream the do not have the attribute. - - :param str attribute_name: - The name of the attribute to sort by. - """ - has_attribute = [ - s - for s in self.fmt_streams - if getattr(s, attribute_name) is not None - ] - # Check that the attributes have string values. - if has_attribute and isinstance( - getattr(has_attribute[0], attribute_name), str - ): - # Try to return a StreamQuery sorted by the integer representations - # of the values. - try: - return StreamQuery( - sorted( - has_attribute, - key=lambda s: int( - "".join( - filter(str.isdigit, getattr(s, attribute_name)) - ) - ), # type: ignore # noqa: E501 - ) - ) - except ValueError: - pass - - return StreamQuery( - sorted(has_attribute, key=lambda s: getattr(s, attribute_name)) - ) - - def desc(self) -> "StreamQuery": - """Sort streams in descending order. - - :rtype: :class:`StreamQuery ` - - """ - return StreamQuery(self.fmt_streams[::-1]) - - def asc(self) -> "StreamQuery": - """Sort streams in ascending order. - - :rtype: :class:`StreamQuery ` - - """ - return self - - def get_by_itag(self, itag: int) -> Optional[Stream]: - """Get the corresponding :class:`Stream ` for a given itag. - - :param int itag: - YouTube format identifier code. - :rtype: :class:`Stream ` or None - :returns: - The :class:`Stream ` matching the given itag or None if - not found. - - """ - return self.itag_index.get(int(itag)) - - def get_by_resolution(self, resolution: str) -> Optional[Stream]: - """Get the corresponding :class:`Stream ` for a given resolution. - - Stream must be a progressive mp4. - - :param str resolution: - Video resolution i.e. "720p", "480p", "360p", "240p", "144p" - :rtype: :class:`Stream ` or None - :returns: - The :class:`Stream ` matching the given itag or None if - not found. 
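A typical chained query against the interface above looks like the following; the URL is an arbitrary example and `YouTube` comes from the same vendored package:

```python
# Usage sketch of the filtering/ordering interface above.
yt = YouTube('https://www.youtube.com/watch?v=2lAe1cqCOXo')
best_progressive = (yt.streams
                    .filter(progressive=True, subtype='mp4')
                    .order_by('resolution')
                    .desc()
                    .first())
print(best_progressive)
```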
- - """ - return self.filter( - progressive=True, subtype="mp4", resolution=resolution - ).first() - - def get_lowest_resolution(self) -> Optional[Stream]: - """Get lowest resolution stream that is a progressive mp4. - - :rtype: :class:`Stream ` or None - :returns: - The :class:`Stream ` matching the given itag or None if - not found. - - """ - return ( - self.filter(progressive=True, subtype="mp4") - .order_by("resolution") - .first() - ) - - def get_highest_resolution(self) -> Optional[Stream]: - """Get highest resolution stream that is a progressive video. - - :rtype: :class:`Stream ` or None - :returns: - The :class:`Stream ` matching the given itag or None if - not found. - - """ - return self.filter(progressive=True).order_by("resolution").last() - - def get_audio_only(self, subtype: str = "mp4") -> Optional[Stream]: - """Get highest bitrate audio stream for given codec (defaults to mp4) - - :param str subtype: - Audio subtype, defaults to mp4 - :rtype: :class:`Stream ` or None - :returns: - The :class:`Stream ` matching the given itag or None if - not found. - """ - return ( - self.filter(only_audio=True, subtype=subtype) - .order_by("abr") - .last() - ) - - def otf(self, is_otf: bool = False) -> "StreamQuery": - """Filter stream by OTF, useful if some streams have 404 URLs - - :param bool is_otf: Set to False to retrieve only non-OTF streams - :rtype: :class:`StreamQuery ` - :returns: A StreamQuery object with otf filtered streams - """ - return self._filter([lambda s: s.is_otf == is_otf]) - - def first(self) -> Optional[Stream]: - """Get the first :class:`Stream ` in the results. - - :rtype: :class:`Stream ` or None - :returns: - the first result of this query or None if the result doesn't - contain any streams. - - """ - try: - return self.fmt_streams[0] - except IndexError: - return None - - def last(self): - """Get the last :class:`Stream ` in the results. - - :rtype: :class:`Stream ` or None - :returns: - Return the last result of this query or None if the result - doesn't contain any streams. - - """ - try: - return self.fmt_streams[-1] - except IndexError: - pass - - @deprecated("Get the size of this list directly using len()") - def count(self, value: Optional[str] = None) -> int: # pragma: no cover - """Get the count of items in the list. - - :rtype: int - """ - if value: - return self.fmt_streams.count(value) - - return len(self) - - @deprecated("This object can be treated as a list, all() is useless") - def all(self) -> List[Stream]: # pragma: no cover - """Get all the results represented by this query as a list. - - :rtype: list - - """ - return self.fmt_streams - - def __getitem__(self, i: Union[slice, int]): - return self.fmt_streams[i] - - def __len__(self) -> int: - return len(self.fmt_streams) - - def __repr__(self) -> str: - return f"{self.fmt_streams}" - - -class CaptionQuery(Mapping): - """Interface for querying the available captions.""" - - def __init__(self, captions: List[Caption]): - """Construct a :class:`Caption `. - - param list captions: - list of :class:`Caption ` instances. - - """ - self.lang_code_index = {c.code: c for c in captions} - - @deprecated( - "This object can be treated as a dictionary, i.e. captions['en']" - ) - def get_by_language_code( - self, lang_code: str - ) -> Optional[Caption]: # pragma: no cover - """Get the :class:`Caption ` for a given ``lang_code``. - - :param str lang_code: - The code that identifies the caption language. 
- :rtype: :class:`Caption ` or None - :returns: - The :class:`Caption ` matching the given ``lang_code`` or - None if it does not exist. - """ - return self.lang_code_index.get(lang_code) - - @deprecated("This object can be treated as a dictionary") - def all(self) -> List[Caption]: # pragma: no cover - """Get all the results represented by this query as a list. - - :rtype: list - - """ - return list(self.lang_code_index.values()) - - def __getitem__(self, i: str): - return self.lang_code_index[i] - - def __len__(self) -> int: - return len(self.lang_code_index) - - def __iter__(self): - return iter(self.lang_code_index.values()) - - def __repr__(self) -> str: - return f"{self.lang_code_index}" diff --git a/001-Downloader/pytube/request.py b/001-Downloader/pytube/request.py deleted file mode 100755 index b31b760..0000000 --- a/001-Downloader/pytube/request.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Implements a simple wrapper around urlopen.""" -import http.client -import json -import logging -import re -import socket -from functools import lru_cache -from urllib import parse -from urllib.error import URLError -from urllib.request import Request, urlopen - -from pytube.exceptions import RegexMatchError, MaxRetriesExceeded -from pytube.helpers import regex_search - -import ssl -ssl._create_default_https_context = ssl._create_unverified_context - -logger = logging.getLogger(__name__) -default_range_size = 9437184 # 9MB - - -def _execute_request( - url, - method=None, - headers=None, - data=None, - timeout=socket._GLOBAL_DEFAULT_TIMEOUT -): - base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"} - if headers: - base_headers.update(headers) - if data: - # encode data for request - if not isinstance(data, bytes): - data = bytes(json.dumps(data), encoding="utf-8") - if url.lower().startswith("http"): - request = Request(url, headers=base_headers, method=method, data=data) - else: - raise ValueError("Invalid URL") - return urlopen(request, timeout=timeout) # nosec - - -def get(url, extra_headers=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): - """Send an http GET request. - - :param str url: - The URL to perform the GET request for. - :param dict extra_headers: - Extra headers to add to the request - :rtype: str - :returns: - UTF-8 encoded string of response - """ - if extra_headers is None: - extra_headers = {} - response = _execute_request(url, headers=extra_headers, timeout=timeout) - return response.read().decode("utf-8") - - -def post(url, extra_headers=None, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): - """Send an http POST request. - - :param str url: - The URL to perform the POST request for. - :param dict extra_headers: - Extra headers to add to the request - :param dict data: - The data to send on the POST request - :rtype: str - :returns: - UTF-8 encoded string of response - """ - # could technically be implemented in get, - # but to avoid confusion implemented like this - if extra_headers is None: - extra_headers = {} - if data is None: - data = {} - # required because the youtube servers are strict on content type - # raises HTTPError [400]: Bad Request otherwise - extra_headers.update({"Content-Type": "application/json"}) - response = _execute_request( - url, - headers=extra_headers, - data=data, - timeout=timeout - ) - return response.read().decode("utf-8") - - -def seq_stream( - url, - timeout=socket._GLOBAL_DEFAULT_TIMEOUT, - max_retries=0 -): - """Read the response in sequence. - :param str url: The URL to perform the GET request for. 
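The `stream()` generator coming up in this module pulls files in `default_range_size` windows (9 MB) via HTTP `Range` headers. A standalone sketch of that core loop, minus the retry bookkeeping, assuming the server honors `Range` and reports `Content-Length`:

```python
# Standalone sketch of the ranged-window download pattern used by request.stream().
from urllib.request import Request, urlopen


def ranged_download(url, out_path, window=9437184):  # 9 MB, as default_range_size above
    head = urlopen(Request(url, method='HEAD'))
    size = int(head.headers['Content-Length'])
    with open(out_path, 'wb') as fh:
        pos = 0
        while pos < size:
            stop = min(pos + window, size) - 1
            rng = {'Range': f'bytes={pos}-{stop}'}
            chunk = urlopen(Request(url, headers=rng)).read()
            fh.write(chunk)
            pos += len(chunk)
```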
- :rtype: Iterable[bytes] - """ - # YouTube expects a request sequence number as part of the parameters. - split_url = parse.urlsplit(url) - base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path) - - querys = dict(parse.parse_qsl(split_url.query)) - - # The 0th sequential request provides the file headers, which tell us - # information about how the file is segmented. - querys['sq'] = 0 - url = base_url + parse.urlencode(querys) - - segment_data = b'' - for chunk in stream(url, timeout=timeout, max_retries=max_retries): - yield chunk - segment_data += chunk - - # We can then parse the header to find the number of segments - stream_info = segment_data.split(b'\r\n') - segment_count_pattern = re.compile(b'Segment-Count: (\\d+)') - for line in stream_info: - match = segment_count_pattern.search(line) - if match: - segment_count = int(match.group(1).decode('utf-8')) - - # We request these segments sequentially to build the file. - seq_num = 1 - while seq_num <= segment_count: - # Create sequential request URL - querys['sq'] = seq_num - url = base_url + parse.urlencode(querys) - - yield from stream(url, timeout=timeout, max_retries=max_retries) - seq_num += 1 - return # pylint: disable=R1711 - - -def stream( - url, - timeout=socket._GLOBAL_DEFAULT_TIMEOUT, - max_retries=0 -): - """Read the response in chunks. - :param str url: The URL to perform the GET request for. - :rtype: Iterable[bytes] - """ - file_size: int = default_range_size # fake filesize to start - downloaded = 0 - while downloaded < file_size: - stop_pos = min(downloaded + default_range_size, file_size) - 1 - range_header = f"bytes={downloaded}-{stop_pos}" - tries = 0 - - # Attempt to make the request multiple times as necessary. - while True: - # If the max retries is exceeded, raise an exception - if tries >= 1 + max_retries: - raise MaxRetriesExceeded() - - # Try to execute the request, ignoring socket timeouts - try: - response = _execute_request( - url, - method="GET", - headers={"Range": range_header}, - timeout=timeout - ) - except URLError as e: - # We only want to skip over timeout errors, and - # raise any other URLError exceptions - if isinstance(e.reason, socket.timeout): - pass - else: - raise - except http.client.IncompleteRead: - # Allow retries on IncompleteRead errors for unreliable connections - pass - else: - # On a successful request, break from loop - break - tries += 1 - - if file_size == default_range_size: - try: - content_range = response.info()["Content-Range"] - file_size = int(content_range.split("/")[1]) - except (KeyError, IndexError, ValueError) as e: - logger.error(e) - while True: - chunk = response.read() - if not chunk: - break - downloaded += len(chunk) - yield chunk - return # pylint: disable=R1711 - - -@lru_cache() -def filesize(url): - """Fetch size in bytes of file at given URL - - :param str url: The URL to get the size of - :returns: int: size in bytes of remote file - """ - return int(head(url)["content-length"]) - - -@lru_cache() -def seq_filesize(url): - """Fetch size in bytes of file at given URL from sequential requests - - :param str url: The URL to get the size of - :returns: int: size in bytes of remote file - """ - total_filesize = 0 - # YouTube expects a request sequence number as part of the parameters. - split_url = parse.urlsplit(url) - base_url = '%s://%s/%s?' 
% (split_url.scheme, split_url.netloc, split_url.path) - querys = dict(parse.parse_qsl(split_url.query)) - - # The 0th sequential request provides the file headers, which tell us - # information about how the file is segmented. - querys['sq'] = 0 - url = base_url + parse.urlencode(querys) - response = _execute_request( - url, method="GET" - ) - - response_value = response.read() - # The file header must be added to the total filesize - total_filesize += len(response_value) - - # We can then parse the header to find the number of segments - segment_count = 0 - stream_info = response_value.split(b'\r\n') - segment_regex = b'Segment-Count: (\\d+)' - for line in stream_info: - # One of the lines should contain the segment count, but we don't know - # which, so we need to iterate through the lines to find it - try: - segment_count = int(regex_search(segment_regex, line, 1)) - except RegexMatchError: - pass - - if segment_count == 0: - raise RegexMatchError('seq_filesize', segment_regex) - - # We make HEAD requests to the segments sequentially to find the total filesize. - seq_num = 1 - while seq_num <= segment_count: - # Create sequential request URL - querys['sq'] = seq_num - url = base_url + parse.urlencode(querys) - - total_filesize += int(head(url)['content-length']) - seq_num += 1 - return total_filesize - - -def head(url): - """Fetch headers returned http GET request. - - :param str url: - The URL to perform the GET request for. - :rtype: dict - :returns: - dictionary of lowercase headers - """ - response_headers = _execute_request(url, method="HEAD").info() - return {k.lower(): v for k, v in response_headers.items()} diff --git a/001-Downloader/pytube/streams.py b/001-Downloader/pytube/streams.py deleted file mode 100755 index 05ec6c1..0000000 --- a/001-Downloader/pytube/streams.py +++ /dev/null @@ -1,374 +0,0 @@ -""" -This module contains a container for stream manifest data. - -A container object for the media stream (video only / audio only / video+audio -combined). This was referred to as ``Video`` in the legacy pytube version, but -has been renamed to accommodate DASH (which serves the audio and video -separately). -""" -import logging -import os -from datetime import datetime -from typing import BinaryIO, Dict, Optional, Tuple -from urllib.error import HTTPError -from urllib.parse import parse_qs - -from pytube import extract, request -from pytube.helpers import safe_filename, target_directory -from pytube.itags import get_format_profile -from pytube.monostate import Monostate - -logger = logging.getLogger(__name__) - - -class Stream: - """Container for stream manifest data.""" - - def __init__( - self, stream: Dict, monostate: Monostate - ): - """Construct a :class:`Stream `. - - :param dict stream: - The unscrambled data extracted from YouTube. - :param dict monostate: - Dictionary of data shared across all instances of - :class:`Stream `. - """ - # A dictionary shared between all instances of :class:`Stream ` - # (Borg pattern). - self._monostate = monostate - - self.url = stream["url"] # signed download url - self.itag = int( - stream["itag"] - ) # stream format id (youtube nomenclature) - - # set type and codec info - - # 'video/webm; codecs="vp8, vorbis"' -> 'video/webm', ['vp8', 'vorbis'] - self.mime_type, self.codecs = extract.mime_type_codec(stream["mimeType"]) - - # 'video/webm' -> 'video', 'webm' - self.type, self.subtype = self.mime_type.split("/") - - # ['vp8', 'vorbis'] -> video_codec: vp8, audio_codec: vorbis. 
DASH - # streams return NoneType for audio/video depending. - self.video_codec, self.audio_codec = self.parse_codecs() - - self.is_otf: bool = stream["is_otf"] - self.bitrate: Optional[int] = stream["bitrate"] - - # filesize in bytes - self._filesize: Optional[int] = int(stream.get('contentLength', 0)) - - # Additional information about the stream format, such as resolution, - # frame rate, and whether the stream is live (HLS) or 3D. - itag_profile = get_format_profile(self.itag) - self.is_dash = itag_profile["is_dash"] - self.abr = itag_profile["abr"] # average bitrate (audio streams only) - if 'fps' in stream: - self.fps = stream['fps'] # Video streams only - self.resolution = itag_profile[ - "resolution" - ] # resolution (e.g.: "480p") - self.is_3d = itag_profile["is_3d"] - self.is_hdr = itag_profile["is_hdr"] - self.is_live = itag_profile["is_live"] - - @property - def is_adaptive(self) -> bool: - """Whether the stream is DASH. - - :rtype: bool - """ - # if codecs has two elements (e.g.: ['vp8', 'vorbis']): 2 % 2 = 0 - # if codecs has one element (e.g.: ['vp8']) 1 % 2 = 1 - return bool(len(self.codecs) % 2) - - @property - def is_progressive(self) -> bool: - """Whether the stream is progressive. - - :rtype: bool - """ - return not self.is_adaptive - - @property - def includes_audio_track(self) -> bool: - """Whether the stream only contains audio. - - :rtype: bool - """ - return self.is_progressive or self.type == "audio" - - @property - def includes_video_track(self) -> bool: - """Whether the stream only contains video. - - :rtype: bool - """ - return self.is_progressive or self.type == "video" - - def parse_codecs(self) -> Tuple[Optional[str], Optional[str]]: - """Get the video/audio codecs from list of codecs. - - Parse a variable length sized list of codecs and returns a - constant two element tuple, with the video codec as the first element - and audio as the second. Returns None if one is not available - (adaptive only). - - :rtype: tuple - :returns: - A two element tuple with audio and video codecs. - - """ - video = None - audio = None - if not self.is_adaptive: - video, audio = self.codecs - elif self.includes_video_track: - video = self.codecs[0] - elif self.includes_audio_track: - audio = self.codecs[0] - return video, audio - - @property - def filesize(self) -> int: - """File size of the media stream in bytes. - - :rtype: int - :returns: - Filesize (in bytes) of the stream. - """ - if self._filesize == 0: - try: - self._filesize = request.filesize(self.url) - except HTTPError as e: - if e.code != 404: - raise - self._filesize = request.seq_filesize(self.url) - return self._filesize - - @property - def title(self) -> str: - """Get title of video - - :rtype: str - :returns: - Youtube video title - """ - return self._monostate.title or "Unknown YouTube Video Title" - - @property - def filesize_approx(self) -> int: - """Get approximate filesize of the video - - Falls back to HTTP call if there is not sufficient information to approximate - - :rtype: int - :returns: size of video in bytes - """ - if self._monostate.duration and self.bitrate: - bits_in_byte = 8 - return int( - (self._monostate.duration * self.bitrate) / bits_in_byte - ) - - return self.filesize - - @property - def expiration(self) -> datetime: - expire = parse_qs(self.url.split("?")[1])["expire"][0] - return datetime.utcfromtimestamp(int(expire)) - - @property - def default_filename(self) -> str: - """Generate filename based on the video title. - - :rtype: str - :returns: - An os file system compatible filename. 
- """ - filename = safe_filename(self.title) - return f"{filename}.{self.subtype}" - - def download( - self, - output_path: Optional[str] = None, - filename: Optional[str] = None, - filename_prefix: Optional[str] = None, - skip_existing: bool = True, - timeout: Optional[int] = None, - max_retries: Optional[int] = 0 - ) -> str: - """Write the media stream to disk. - - :param output_path: - (optional) Output path for writing media file. If one is not - specified, defaults to the current working directory. - :type output_path: str or None - :param filename: - (optional) Output filename (stem only) for writing media file. - If one is not specified, the default filename is used. - :type filename: str or None - :param filename_prefix: - (optional) A string that will be prepended to the filename. - For example a number in a playlist or the name of a series. - If one is not specified, nothing will be prepended - This is separate from filename so you can use the default - filename but still add a prefix. - :type filename_prefix: str or None - :param skip_existing: - (optional) Skip existing files, defaults to True - :type skip_existing: bool - :param timeout: - (optional) Request timeout length in seconds. Uses system default. - :type timeout: int - :param max_retries: - (optional) Number of retries to attempt after socket timeout. Defaults to 0. - :type max_retries: int - :returns: - Path to the saved video - :rtype: str - - """ - file_path = self.get_file_path( - filename=filename, - output_path=output_path, - filename_prefix=filename_prefix, - ) - - if skip_existing and self.exists_at_path(file_path): - logger.debug(f'file {file_path} already exists, skipping') - self.on_complete(file_path) - return file_path - - bytes_remaining = self.filesize - logger.debug(f'downloading ({self.filesize} total bytes) file to {file_path}') - - with open(file_path, "wb") as fh: - try: - for chunk in request.stream( - self.url, - timeout=timeout, - max_retries=max_retries - ): - # reduce the (bytes) remainder by the length of the chunk. - bytes_remaining -= len(chunk) - # send to the on_progress callback. - self.on_progress(chunk, fh, bytes_remaining) - except HTTPError as e: - if e.code != 404: - raise - # Some adaptive streams need to be requested with sequence numbers - for chunk in request.seq_stream( - self.url, - timeout=timeout, - max_retries=max_retries - ): - # reduce the (bytes) remainder by the length of the chunk. - bytes_remaining -= len(chunk) - # send to the on_progress callback. - self.on_progress(chunk, fh, bytes_remaining) - self.on_complete(file_path) - return file_path - - def get_file_path( - self, - filename: Optional[str] = None, - output_path: Optional[str] = None, - filename_prefix: Optional[str] = None, - ) -> str: - if not filename: - filename = self.default_filename - if filename_prefix: - filename = f"{filename_prefix}{filename}" - return os.path.join(target_directory(output_path), filename) - - def exists_at_path(self, file_path: str) -> bool: - return ( - os.path.isfile(file_path) - and os.path.getsize(file_path) == self.filesize - ) - - def stream_to_buffer(self, buffer: BinaryIO) -> None: - """Write the media stream to buffer - - :rtype: io.BytesIO buffer - """ - bytes_remaining = self.filesize - logger.info( - "downloading (%s total bytes) file to buffer", self.filesize, - ) - - for chunk in request.stream(self.url): - # reduce the (bytes) remainder by the length of the chunk. - bytes_remaining -= len(chunk) - # send to the on_progress callback. 
-            self.on_progress(chunk, buffer, bytes_remaining)
-        self.on_complete(None)
-
-    def on_progress(
-        self, chunk: bytes, file_handler: BinaryIO, bytes_remaining: int
-    ):
-        """On progress callback function.
-
-        This function writes the binary data to the file, then checks if an
-        additional callback is defined in the monostate. This is exposed to
-        allow things like displaying a progress bar.
-
-        :param bytes chunk:
-            Segment of media file binary data, not yet written to disk.
-        :param file_handler:
-            The file handle where the media is being written to.
-        :type file_handler:
-            :py:class:`io.BufferedWriter`
-        :param int bytes_remaining:
-            The delta between the total file size in bytes and amount already
-            downloaded.
-
-        :rtype: None
-
-        """
-        file_handler.write(chunk)
-        logger.debug("download remaining: %s", bytes_remaining)
-        if self._monostate.on_progress:
-            self._monostate.on_progress(self, chunk, bytes_remaining)
-
-    def on_complete(self, file_path: Optional[str]):
-        """On download complete handler function.
-
-        :param file_path:
-            The file handle where the media is being written to.
-        :type file_path: str
-
-        :rtype: None
-
-        """
-        logger.debug("download finished")
-        on_complete = self._monostate.on_complete
-        if on_complete:
-            logger.debug("calling on_complete callback %s", on_complete)
-            on_complete(self, file_path)
-
-    def __repr__(self) -> str:
-        """Printable object representation.
-
-        :rtype: str
-        :returns:
-            A string representation of a :class:`Stream <Stream>` object.
-        """
-        parts = ['itag="{s.itag}"', 'mime_type="{s.mime_type}"']
-        if self.includes_video_track:
-            parts.extend(['res="{s.resolution}"', 'fps="{s.fps}fps"'])
-            if not self.is_adaptive:
-                parts.extend(
-                    ['vcodec="{s.video_codec}"', 'acodec="{s.audio_codec}"',]
-                )
-            else:
-                parts.extend(['vcodec="{s.video_codec}"'])
-        else:
-            parts.extend(['abr="{s.abr}"', 'acodec="{s.audio_codec}"'])
-        parts.extend(['progressive="{s.is_progressive}"', 'type="{s.type}"'])
-        return f"<Stream: {' '.join(parts).format(s=self)}>"
diff --git a/001-Downloader/pytube/version.py b/001-Downloader/pytube/version.py
deleted file mode 100755
index 4168adc..0000000
--- a/001-Downloader/pytube/version.py
+++ /dev/null
@@ -1,4 +0,0 @@
-__version__ = "11.0.0"
-
-if __name__ == "__main__":
-    print(__version__)
diff --git a/001-Downloader/test/bilibili_video_download_v1.py b/001-Downloader/test/bilibili_video_download_v1.py
new file mode 100644
index 0000000..086caa2
--- /dev/null
+++ b/001-Downloader/test/bilibili_video_download_v1.py
@@ -0,0 +1,243 @@
+# !/usr/bin/python
+# -*- coding:utf-8 -*-
+# time: 2019/04/17--08:12
+__author__ = 'Henry'
+
+'''
+Project: Bilibili video downloader
+
+Version 1: signed-API variant; no cookie needed, downloads 1080p directly
+
+20190422 - added downloading a single part of a multi-part (multi-P) video
+'''
+
+import requests, time, hashlib, urllib.request, re, json
+from moviepy.editor import *
+import os, sys
+
+
+# Call the signed playurl API to get the segment URLs
+def get_play_list(start_url, cid, quality):
+    entropy = 'rbMCKn@KuamXWlPMoJGsKcbiJKUfkPF_8dABscJntvqhRSETg'
+    appkey, sec = ''.join([chr(ord(i) + 2) for i in entropy[::-1]]).split(':')
+    params = 'appkey=%s&cid=%s&otype=json&qn=%s&quality=%s&type=' % (appkey, cid, quality, quality)
+    chksum = hashlib.md5(bytes(params + sec, 'utf8')).hexdigest()
+    url_api = 'https://interface.bilibili.com/v2/playurl?%s&sign=%s' % (params, chksum)
+    headers = {
+        'Referer': start_url,  # note: the Referer header is required here
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
+    }
+    # print(url_api)
+    html = requests.get(url_api, headers=headers).json()
+    # print(json.dumps(html))
+    video_list = []
+    for i in html['durl']:
+        video_list.append(i['url'])
+    # print(video_list)
+    return video_list
+
+
+# Download the videos
+'''
+    Callback signature for urllib.request.urlretrieve:
+def callbackfunc(blocknum, blocksize, totalsize):
+    @blocknum:  number of data blocks downloaded so far
+    @blocksize: size of one data block
+    @totalsize: total size of the remote file
+'''
+
+
+def Schedule_cmd(blocknum, blocksize, totalsize):
+    speed = (blocknum * blocksize) / (time.time() - start_time)
+    # speed_str = " Speed: %.2f" % speed
+    speed_str = " Speed: %s" % format_size(speed)
+    recv_size = blocknum * blocksize
+
+    # Draw the download progress bar
+    f = sys.stdout
+    percent = recv_size / totalsize
+    percent_str = "%.2f%%" % (percent * 100)
+    n = round(percent * 50)
+    s = ('#' * n).ljust(50, '-')
+    f.write(percent_str.ljust(8, ' ') + '[' + s + ']' + speed_str)
+    f.flush()
+    # time.sleep(0.1)
+    f.write('\r')
+
+
+def Schedule(blocknum, blocksize, totalsize):
+    speed = (blocknum * blocksize) / (time.time() - start_time)
+    # speed_str = " Speed: %.2f" % speed
+    speed_str = " Speed: %s" % format_size(speed)
+    recv_size = blocknum * blocksize
+
+    # Draw the download progress bar
+    f = sys.stdout
+    percent = recv_size / totalsize
+    percent_str = "%.2f%%" % (percent * 100)
+    n = round(percent * 50)
+    s = ('#' * n).ljust(50, '-')
+    print(percent_str.ljust(6, ' ') + '-' + speed_str)
+    f.flush()
+    time.sleep(2)
+    # print('\r')
+
+
+# Convert a byte count into K/M/G units
+def format_size(bytes):
+    try:
+        bytes = float(bytes)
+        kb = bytes / 1024
+    except (TypeError, ValueError):
+        print("传入的字节格式不对")
+        return "Error"
+    if kb >= 1024:
+        M = kb / 1024
+        if M >= 1024:
+            G = M / 1024
+            return "%.3fG" % (G)
+        else:
+            return "%.3fM" % (M)
+    else:
+        return "%.3fK" % (kb)
+
+
+# Download all segments of one video
+def down_video(video_list, title, start_url, page):
+    num = 1
+    print('[正在下载P{}段视频,请稍等...]:'.format(page) + title)
+    currentVideoPath = os.path.join(sys.path[0], 'bilibili_video', title)  # download under the current directory
+    for i in video_list:
+        opener = urllib.request.build_opener()
+        # Request headers
+        opener.addheaders = [
+            # ('Host', 'upos-hz-mirrorks3.acgvideo.com'),  # adjust the Host if needed; optional
+            ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0'),
+            ('Accept', '*/*'),
+            ('Accept-Language', 'en-US,en;q=0.5'),
+            ('Accept-Encoding', 'gzip, deflate, br'),
+            ('Range', 'bytes=0-'),  # Range must be bytes=0- to fetch the complete video
+            ('Referer', start_url),  # the Referer header is required!
+            ('Origin', 'https://www.bilibili.com'),
+            ('Connection', 'keep-alive'),
+        ]
+        urllib.request.install_opener(opener)
+        # Create the folder that holds the downloaded video
+        if not os.path.exists(currentVideoPath):
+            os.makedirs(currentVideoPath)
+        # Start downloading
+        if len(video_list) > 1:
+            urllib.request.urlretrieve(url=i, filename=os.path.join(currentVideoPath, r'{}-{}.mp4'.format(title, num)),
+                                       reporthook=Schedule_cmd)  # .flv also works: title + '-' + num + '.flv'
+        else:
+            urllib.request.urlretrieve(url=i, filename=os.path.join(currentVideoPath, r'{}.mp4'.format(title)),
+                                       reporthook=Schedule_cmd)  # .flv also works: title + '-' + num + '.flv'
+        num += 1
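The interesting part of `get_play_list()` above is the signing: the appkey/secret pair is recovered by shifting each character of the obfuscated `entropy` string, and the query is then authenticated with `md5(params + secret)`. Isolated, with a made-up cid:

```python
# Standalone sketch of the playurl signing scheme used by get_play_list() above.
import hashlib

entropy = 'rbMCKn@KuamXWlPMoJGsKcbiJKUfkPF_8dABscJntvqhRSETg'
appkey, sec = ''.join(chr(ord(c) + 2) for c in entropy[::-1]).split(':')
params = 'appkey=%s&cid=%s&otype=json&qn=%s&quality=%s&type=' % (appkey, 12345, 80, 80)
sign = hashlib.md5((params + sec).encode('utf8')).hexdigest()
url_api = 'https://interface.bilibili.com/v2/playurl?%s&sign=%s' % (params, sign)
print(url_api)
```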
+
+
+# Merge the downloaded segments
+def combine_video(video_list, title):
+    currentVideoPath = os.path.join(sys.path[0], 'bilibili_video', title)  # download under the current directory
+    if not os.path.exists(currentVideoPath):
+        os.makedirs(currentVideoPath)
+    if len(video_list) >= 2:
+        # Only merge when the video has more than one segment
+        print('[下载完成,正在合并视频...]:' + title)
+        # Collect the clips here
+        L = []
+        # Walk the video folder (all segments are assumed to live here)
+        root_dir = currentVideoPath
+        # Iterate over the files, sorted by segment number
+        for file in sorted(os.listdir(root_dir), key=lambda x: int(x[x.rindex("-") + 1:x.rindex(".")])):
+            # Only pick up segment files (down_video saves .mp4; .flv kept for the older naming)
+            if os.path.splitext(file)[1] in ('.mp4', '.flv'):
+                # Build the full path
+                filePath = os.path.join(root_dir, file)
+                # Load the clip
+                video = VideoFileClip(filePath)
+                # Append it to the list
+                L.append(video)
+        # Concatenate the clips
+        final_clip = concatenate_videoclips(L)
+        # Write the merged target file
+        final_clip.to_videofile(os.path.join(root_dir, r'{}.mp4'.format(title)), fps=24, remove_temp=False)
+        print('[视频合并完成]:' + title)
+
+    else:
+        # A single-segment video: nothing to merge
+        print('[视频合并完成]:' + title)
+
+
+def getAid(Bid):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
+    }
+    url = "https://api.bilibili.com/x/web-interface/view?bvid=" + Bid
+    print(url)
+    r = requests.get(url, headers=headers)
+    j = json.loads(r.text)
+    # print(j["data"]["aid"])
+    print(j)
+    return j["data"]["aid"]
+
+
+if __name__ == '__main__':
+    # Ask the user for an av id, bv id, or video page URL
+    print('*' * 30 + 'B站视频下载小助手' + '*' * 30)
+    start = input('请输入您要下载的B站av号、bv号或者视频链接地址:')
+    if 'http' in start:
+        if 'video/BV' in start:
+            bv = re.findall(r'video/(.*?)\?', start)[0]
+            start = str(getAid(bv))
+            print(start)
+    if start.isdigit():  # the input is a plain av number
+        # API that returns the cid, given the aid
+        start_url = 'https://api.bilibili.com/x/web-interface/view?aid=' + start
+    else:
+        # e.g. https://www.bilibili.com/video/av46958874/?spm_id_from=333.334.b_63686965665f7265636f6d6d656e64.16
+        start_url = 'https://api.bilibili.com/x/web-interface/view?aid=' + re.search(r'/av(\d+)/*', start).group(1)
+        # e.g. https://www.bilibili.com/video/BV1jL4y1e7Uz?t=7.2
+        # start_url = 'https://api.bilibili.com/x/web-interface/view?aid=' + re.findall(r'video/(.*?)\?', start)[0]
+    print(start_url)
+    # Video quality codes:
+    #   1080p -> 80
+    #   720p  -> 64
+    #   480p  -> 32, 360p -> 16
+    quality = input('请输入您要下载视频的清晰度(1080p:80;720p:64;480p:32;360p:16)(填写80或64或32或16):')
+    # Fetch the video's cid and title
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
+    }
+    html = requests.get(start_url, headers=headers).json()
+    data = html['data']
+    video_title = data["title"].replace(" ", "_")
+    cid_list = []
+    if '?p=' in start:
+        # Download a single part of a multi-part video
+        p = re.search(r'\?p=(\d+)', start).group(1)
+        cid_list.append(data['pages'][int(p) - 1])
+    else:
+        # No p parameter: download every part
+        cid_list = data['pages']
+    # print(cid_list)
+    for item in cid_list:
+        cid = str(item['cid'])
+        title = item['part']
+        if not title:
+            title = video_title
+        title = re.sub(r'[\/\\:*?"<>|]', '', title)  # strip characters that are illegal in filenames
+        print('[下载视频的cid]:' + cid)
+        print('[下载视频的标题]:' + title)
+        page = str(item['page'])
+        # 每一集单独拼出带 ?p= 的地址作为 Referer(不要直接累加 start_url,循环多集时会越拼越长)
+        page_url = start_url + "/?p=" + page
+        video_list = get_play_list(page_url, cid, quality)
+        start_time = time.time()
+        down_video(video_list, title, page_url, page)
+        combine_video(video_list, title)
+
+    # 如果是windows系统,下载完成后打开下载目录
+    currentVideoPath = os.path.join(sys.path[0], 'bilibili_video')  # 当前目录作为下载目录
+    if sys.platform.startswith('win'):
+        os.startfile(currentVideoPath)
+
+# 分P视频下载测试: https://www.bilibili.com/video/av19516333/
diff --git a/001-Downloader/test/ff_video.py b/001-Downloader/test/ff_video.py
new file mode 100644
index 0000000..45850b7
--- /dev/null
+++ b/001-Downloader/test/ff_video.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Description: ffmpeg去掉最后一帧,改变md5
+@Date :2022/02/17
+@Author :xhunmon
+@Mail :xhunmon@gmail.com
+"""
+import os
+
+
+def cute_video(folder):
+    files = next(os.walk(folder))[2]  # 获取文件
+    for file in files:
+        file_path = os.path.join(folder, file)
+        shotname, extension = os.path.splitext(file)
+        if len(shotname) == 0 or len(extension) == 0:
+            continue
+        out_file = os.path.join(folder, 'out-{}{}'.format(shotname, extension))
+        # 获取时长。填入自己系统安装的ffmpeg路径,注意斜杠
+        time = os.popen(
+            r"/usr/local/ffmpeg/bin/ffmpeg -i {} 2>&1 | grep 'Duration' | cut -d ' ' -f 4 | sed s/,//".format(
+                file_path)).read().replace('\n', '').replace(' ', '')
+        if '.' in time:
+            match_time = time.split('.')[0]
+        else:
+            match_time = time
+        print(match_time)
+        ts = match_time.split(':')
+        sec = int(ts[0]) * 60 * 60 + int(ts[1]) * 60 + int(ts[2])
+        # 从0分0秒100毫秒开始截切(目的就是去头去尾)
+        os.popen(r"/usr/local/ffmpeg/bin/ffmpeg -ss 0:00.100 -i {} -t {} -c:v copy -c:a copy {}".format(file_path, sec,
+                                                                                                        out_file))
+
+
+# 主模块执行
+if __name__ == "__main__":
+    # path = os.path.dirname('/Users/Qincji/Downloads/ffmpeg/')
+    path = os.path.dirname('需要处理的目录')  # 目录下的所有视频
+    cute_video(path)
diff --git a/001-Downloader/pytube/contrib/__init__.py b/001-Downloader/test/urls.txt
old mode 100755
new mode 100644
similarity index 100%
rename from 001-Downloader/pytube/contrib/__init__.py
rename to 001-Downloader/test/urls.txt
diff --git a/001-Downloader/test/xhs_download.py b/001-Downloader/test/xhs_download.py
new file mode 100644
index 0000000..554c740
--- /dev/null
+++ b/001-Downloader/test/xhs_download.py
@@ -0,0 +1,67 @@
+import os
+import random
+import time
+
+import requests
+from my_fake_useragent import UserAgent
+
+ua = UserAgent(family='chrome')
+pre_save = os.path.join(os.path.curdir, '0216')
+
+'''
+
+'''
+
+
+def download_url(url, index):
+    try:
+        headers = {
+            'Accept': '*/*',
+            'Accept-Encoding': 'identity;q=1, *;q=0',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive',
+            'Cookie': 'xhsTrackerId=6970aca9-a496-4f50-cf98-118929f063bf; timestamp2=2022021544322a4e45f1e1dec93beb82; timestamp2.sig=jk1cFo-zHueSZUpZRvlqyJwTFoA1y8ch9t76Bfy28_Q; solar.beaker.session.id=1644906492328060192125; xhsTracker=url=index&searchengine=google',
+            'Host': 'v.xiaohongshu.com',
+            'Pragma': 'no-cache',
+            'Referer': url,
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36'
+        }
+        video = requests.get(url, headers=headers)  # 保存视频
+        start = time.time()  # 下载开始时间
+        size = 0  # 初始化已下载大小
+        chunk_size = 100  # 每次下载的数据大小
+        content_size = int(video.headers['content-length'])  # 下载文件总大小
+        print(video.status_code)
+        if video.status_code == 200:  # 判断是否响应成功
+            print(str(index) + '[文件 大小]:{size:.2f} MB'.format(size=content_size / 1024
/ 1024)) # 开始下载,显示下载文件大小 + v_url = os.path.join(pre_save, '{}.mp4'.format(index)) + # v_url = pre_save + '[' + author_list[i] + '].mp4' + with open(v_url, 'wb') as file: # 显示进度条 + for data in video.iter_content(chunk_size=chunk_size): + file.write(data) + size += len(data) + # print('\r' + i + '\n[下载进度]:%s%.2f%%' % ( + # '>' * int(size * 50 / content_size), float(size / content_size * 100))) + end = time.time() # 下载结束时间 + print('\n' + str(index) + '\n[下载完成]:耗时: %.2f秒\n' % (end - start)) # 输出下载用时时间 + except Exception as error: + # Downloader.print_ui2(error) + print(error) + print('该页视频没有' + str(index) + ',已为您跳过\r') + + +if __name__ == '__main__': + ls = [] + if not os.path.exists(pre_save): + os.makedirs(pre_save) + with open('../xhs/urls.txt', 'r') as f: + for line in f: + if 'http' in line: + ls.append(line.replace('\n', '').replace(' ', '')) + size = len(ls) + for i in range(0, size): + url = ls[i] + print('{}-{}'.format(i, url)) + download_url(url, i) + time.sleep(random.randint(5, 10)) diff --git a/001-Downloader/ui.py b/001-Downloader/ui.py index 2ba4967..55d2a2b 100644 --- a/001-Downloader/ui.py +++ b/001-Downloader/ui.py @@ -12,7 +12,6 @@ from douyin.dy_download import DouYin from downloader import Downloader from kuaishou.ks_download import KuaiShou -from pytube import YouTube from type_enum import PrintType from utils import * @@ -31,7 +30,8 @@ def __init__(self, master=None): self.createWidgets() def window_init(self): - self.master.title('欢迎使用-自媒体资源下载器' + Config.instance().get_version_name() + ',本程序仅用于学习交流!如有疑问请联系:xhunmon@gmail.com') + self.master.title( + '欢迎使用-自媒体资源下载器' + Config.instance().get_version_name() + ',本程序仅用于学习交流!如有疑问请联系:xhunmon@gmail.com') self.master.bg = bg_color width, height = self.master.maxsize() # self.master.geometry("{}x{}".format(width, height)) @@ -150,10 +150,7 @@ def start_download(self): url = self.urlEntry.get() path = self.dirEntry.get() domain = get_domain(url) - if "youtube" in domain: - # downloader: Downloader = YouTube(url).streams.first().download() - downloader: Downloader = YouTube(url) - elif "kwaicdn" in domain or "kuaishou" in domain: + if "kwaicdn" in domain or "kuaishou" in domain: downloader: KuaiShou = KuaiShou() # downloader.set_cookie() else: diff --git a/005-PaidSource/.gitignore b/005-PaidSource/.gitignore new file mode 100644 index 0000000..b6e4761 --- /dev/null +++ b/005-PaidSource/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/005-PaidSource/005-PaidSource.iml b/005-PaidSource/005-PaidSource.iml new file mode 100644 index 0000000..ad3c0a3 --- /dev/null +++ b/005-PaidSource/005-PaidSource.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/005-PaidSource/README.md b/005-PaidSource/README.md new file mode 100644 index 0000000..b671552 --- /dev/null +++ b/005-PaidSource/README.md @@ -0,0 +1,47 @@ +# 这些脚本你肯定会有用到的 + +### 操作已打开的chrome浏览器 + +场景:某些情况我们获取怎么都获取不到cookie,但我们可以使用先在浏览器上登录,然后进行自动化操作。 + +操作指南: + +```shell +需要以该方式启动的浏览器: +win: chrome.exe --remote-debugging-port=9222 +mac:/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222& +``` + +实现脚本:[chrome.py](./chrome.py) + +### excel表的常规操作 + +场景:word文档生活使用就不用多说了,学会定能给生活带来很大的便利。 + +操作指南:使用 pandas 开源库实现。 + +实现脚本:① 考勤统计实现 [kaoqin.py](./kaoqin.py) 。②从excel表取数据翻译后重新写入[gtransfer.py](./gtransfer.py) + +### 用ffmpeg批量修改视频的md5值 + +场景:短视频搬运专用。 + +操作指南:需要安装ffmpeg环境。 + +实现脚本:[ff_video.py](./ff_video.py) + +### 文件相关操作:json读写、文件子目录文件获取、html转word等 + +场景:文件的一些操作。 + +操作指南:略。 + +实现脚本:[file_util.py](./file_util.py) + +### 其他站点爬虫与解析 + +场景:注意学会BeautifulSoup解析,取属性值等。 + +操作指南:略。 + +实现脚本:[other_site.py](./other_site.py) \ No newline at end of file diff --git a/005-PaidSource/__init__.py b/005-PaidSource/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/005-PaidSource/chrome.py b/005-PaidSource/chrome.py new file mode 100644 index 0000000..1fe3d62 --- /dev/null +++ b/005-PaidSource/chrome.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Description: 用已经打开的chrome浏览器进行自动化操作。 +在某些应用场景我们获取怎么都获取不到cookie,但我们可以使用先在浏览器上登录,然后进行自动化操作。 +这里实现book118.com网站自动化操作。 +@Date :2022/1/14 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" + +import asyncio +import random +import time + +import aiohttp +import requests +from bs4 import BeautifulSoup +from pyppeteer import launcher + +import file_util as futls +from v2ray_pool import Net + +loop = asyncio.get_event_loop() + + +async def get_cookie(page): + """ + 获取cookie + :param:page page对象 + :return:cookies 处理后的cookie + """ + cookie_list = await 
page.cookies() + cookies = "" + for cookie in cookie_list: + coo = "{}={};".format(cookie.get("name"), cookie.get("value")) + cookies += coo + return cookies + + +async def main(): + async with aiohttp.ClientSession() as session: + try: + async with session.get("http://localhost:9222/json/version") as response: + chrome = await response.json() + browser = await launcher.connect( + defaultViewport=None, + loop=loop, + browserWSEndpoint=chrome['webSocketDebuggerUrl'] + ) + except aiohttp.ClientConnectorError: + print("start chrome --headless --remote-debugging-port=9222 --disable-gpu") + return + # pages = await browser.pages() + page = await browser.newPage() # "通过 Browser 对象创建页面 Page 对象" + await page.goto('https://max.book118.com/user_center_v1/doc/Doclist/trash.html') + table = await page.waitForSelector('#table') + print(table) + content = await page.content() # 获取页面内容 + futls.write(content, 'test/src.html') + agent = await browser.userAgent() + cookies = await get_cookie(page) + print('agent:[%s]' % agent) + print('cookies:[%s]' % cookies) + results = Book118.parse_page(content) + print(results) + print('需要操作的个数:[%d]' % len(results)) + headers = { + 'Cookie': cookies, + 'User-Agent': agent + } + for result in results: + aids = result.get('aids') + title = result.get('title') + Book118.recycling(headers, aids, title) + time.sleep(random.randint(2, 5)) + Book118.recycling_name(headers, aids) + time.sleep(random.randint(2, 5)) + + +class Book118(Net): + '''https://max.book118.com/user_center_v1/doc/index/index.html#trash''' + + @staticmethod + def recycling(headers, aids, title): + data = { + 'aids': aids, + 'is_optimization': 0, + 'title': title, + 'keywords': '', + 'typeid': 481, + 'dirid': 0, + 'is_original': 0, + 'needmoney': random.randint(3, 35), + 'summary': '' + } + url = 'https://max.book118.com/user_center_v1/doc/Api/updateDocument/docListType/recycling' + r = requests.post(url=url, data=data, headers=headers, allow_redirects=False, verify=False, timeout=15, + stream=True) + if r.status_code == 200: + print('修改[%s]成功' % title) + else: + print(r) + raise Exception('[%s]修改失败!' % title) + + @staticmethod + def recycling_name(headers, aids): + data = { + 'aids': aids, + 'reason': '文件名已修复', + 'status': 1 + } + url = 'https://max.book118.com/user_center_v1/doc/Api/recoverDocument/docListType/recycling' + r = requests.post(url=url, data=data, headers=headers, allow_redirects=False, verify=False, + timeout=15, stream=True) + if r.status_code == 200: + print('提交[%s]成功' % aids) + else: + print(r) + raise Exception('[%s]操作失败!' 
% aids) + + @staticmethod + def load_page(url): + r = requests.get(url=url, allow_redirects=False, verify=False, + timeout=15, stream=True) + r.encoding = r.apparent_encoding + print('url[%s], code[%d]' % (url, r.status_code)) + if r.status_code == 200: + return r.text + return None + + @staticmethod + def parse_page(content): + soup = BeautifulSoup(content, 'html.parser') + tbody = soup.find('tbody') + results = [] + for tr in tbody.find_all('tr'): + if '文档名不规范' in tr.find('td', class_='col-delete-reason').text: + title: str = tr.get_attribute_list('data-title')[0] + if title.endswith('..docx'): + title = title.replace('..docx', '') + aids = tr.get_attribute_list('data-aid')[0] + results.append({'aids': aids, 'title': title}) + return results + + +if __name__ == "__main__": + ''' + 注意:需要以该方式启动的浏览器: + win: chrome.exe --remote-debugging-port=9222 + mac:/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222& + ''' + loop.run_until_complete(main()) diff --git a/005-PaidSource/ff_video.py b/005-PaidSource/ff_video.py new file mode 100644 index 0000000..9178b07 --- /dev/null +++ b/005-PaidSource/ff_video.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Description: 使用ffmpeg去掉最后一帧,改变md5。短视频搬运专用 +@Date :2022/02/17 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" +import os + + +def cute_video(folder): + files = next(os.walk(folder))[2] # 获取文件 + for file in files: + file_path = os.path.join(folder, file) + shotname, extension = os.path.splitext(file) + if len(shotname) == 0 or len(extension) == 0: + continue + out_file = os.path.join(folder, 'out-{}{}'.format(shotname, extension)) + # 获取时间。输入自己系统安装的ffmpeg,注意斜杠 + time = os.popen( + r"/usr/local/ffmpeg/bin/ffmpeg -i {} 2>&1 | grep 'Duration' | cut -d ' ' -f 4 | sed s/,//".format( + file_path)).read().replace('\n', '').replace(' ', '') + if '.' 
in time: + match_time = time.split('.')[0] + else: + match_time = time + print(match_time) + ts = match_time.split(':') + sec = int(ts[0]) * 60 * 60 + int(ts[1]) * 60 + int(ts[2]) + # 从0分0秒100毫秒开始截切(目的就是去头去尾) + os.popen(r"/usr/local/ffmpeg/bin/ffmpeg -ss 0:00.100 -i {} -t {} -c:v copy -c:a copy {}".format(file_path, sec, + out_file)) + + +# 主模块执行 +if __name__ == "__main__": + # path = os.path.dirname('/Users/Qincji/Downloads/ffmpeg/') + path = os.path.dirname('需要处理的目录') # 目录下的所有视频 + cute_video(path) diff --git a/005-PaidSource/file_util.py b/005-PaidSource/file_util.py new file mode 100644 index 0000000..961aa0a --- /dev/null +++ b/005-PaidSource/file_util.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Description: 文件相关处理 +@Date :2022/01/22 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" + +import datetime +import json +import os +import re +import shutil + +import cairosvg +import pandas as pd +import pypandoc # 要安装pandoc +from docx import Document + + +def file_name(file_dir): + results = [] + for root, dirs, files in os.walk(file_dir): + # print(root) # 当前目录路径 + # print(dirs) # 当前路径下所有子目录 + # print(files) # 当前路径下所有非目录子文件 + results += files + return results + + +def deal_one_page(): + fs = file_name('100条') + for f in fs: + try: + print('正在检测【%s】' % f) + shotname, extension = os.path.splitext('%s' % f) + print('正在检测【%s】' % shotname) + if '1篇' in shotname: + new_name = re.sub(r'1篇', '', f) + document = Document(r"html/%s" % f) + paragraphs = document.paragraphs + p = paragraphs[0] + p._element.getparent().remove(p._element) + document.save(r"html/%s" % new_name) + os.remove('html/%s' % f) + except Exception as e: + print(e) + + +def copy_doc(): + fs = file_name('all') + i = 1 + k = 1 + temp_dir = '01' + os.makedirs('100条/%s' % temp_dir) + for f in fs: + try: + # print('正在检测【%s】' % f) + shotname, extension = os.path.splitext('%s' % f) + shutil.copyfile(r'all/%s' % f, r'100条/%s/%s' % (temp_dir, f)) + if i % 100 == 0: + temp_dir = '0%d' % k if k < 10 else '%d' % k + k += 1 + os.makedirs('100条/%s' % temp_dir) + i += 1 + except Exception as e: + print(e) + + +'''########文件处理相关#########''' + + +def html_cover_doc(in_path, out_path): + '''将html转化成功doc''' + path, file_name = os.path.split(out_path) + if path and not os.path.exists(path): + os.makedirs(path) + pypandoc.convert_file(in_path, 'docx', outputfile=out_path) + + +def svg_cover_jpg(src, dst): + '''' + drawing = svg2rlg("drawing.svg") + renderPDF.drawToFile(drawing, "drawing.pdf") + renderPM.drawToFile(drawing, "fdrawing.png", fmt="PNG") + renderPM.drawToFile(drawing, "drawing.jpg", fmt="JPG") + ''' + path, file_name = os.path.split(dst) + if path and not os.path.exists(path): + os.makedirs(path) + # drawing = svg2rlg(src) + # renderPM.drawToFile(drawing, dst, fmt="JPG") + cairosvg.svg2png(url=src, write_to=dst) + + +def html_cover_excel(content, out_path): + '''将html转化成excel''' + path, file_name = os.path.split(out_path) + if path and not os.path.exists(path): + os.makedirs(path) + tables = pd.read_html(content, encoding='utf-8') + writer = pd.ExcelWriter(out_path) + for i in range(len(tables)): + tables[i].to_excel(writer, sheet_name='表%d' % (i + 1)) # startrow + writer.save() # 写入硬盘 + + +def write_to_html(content, file_path): + '''将内容写入本地,自动加上head等信息''' + page = ''' + + + + + ''' + page += content + page += ''' + ''' + write(page, file_path) + + +def write_json(content, file_path): + '''写入json''' + path, file_name = os.path.split(file_path) + if path and not os.path.exists(path): + os.makedirs(path) + 
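+    # 编者注:ensure_ascii=False 让中文以原样写入 json 缓存,便于人工核对;
+    # 若需跨平台编码一致,可考虑再显式传 encoding='utf-8'(此处沿用系统默认编码)。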
with open(file_path, 'w') as f:
+        json.dump(content, f, ensure_ascii=False)
+        f.close()
+
+
+def read_json(file_path):
+    '''读取json'''
+    with open(file_path, 'r') as f:
+        js_get = json.load(f)
+        f.close()
+    return js_get
+
+
+def write(content, file_path):
+    '''写入txt文本内容'''
+    path, file_name = os.path.split(file_path)
+    if path and not os.path.exists(path):
+        os.makedirs(path)
+    with open(file_path, 'w') as f:
+        f.write(content)
+        f.close()
+
+
+def read(file_path) -> str:
+    '''读取txt文本内容'''
+    content = None
+    try:
+        with open(file_path, 'r') as f:
+            content = f.read()
+            f.close()
+    except Exception as e:
+        print(e)
+    return content
+
+
+def get_next_folder(dst, day_diff, folder, max_size):
+    '''遍历目录文件,直到文件夹不存在或者数目达到最大(max_size)时,返回路径'''
+    while True:
+        day_time = (datetime.date.today() + datetime.timedelta(days=day_diff)).strftime('%Y-%m-%d')  # 下一天的目录继续遍历
+        folder_path = os.path.join(dst, day_time, folder)
+        if os.path.exists(folder_path):  # 已存在目录
+            size = len(next(os.walk(folder_path))[2])
+            if size >= max_size:  # 已满,该下一个目录了
+                day_diff += 1
+                continue
+        else:
+            os.makedirs(folder_path)
+        return day_diff, folder_path
+
+
+if __name__ == '__main__':
+    pass
diff --git a/005-PaidSource/gsearch.py b/005-PaidSource/gsearch.py
new file mode 100644
index 0000000..daf2102
--- /dev/null
+++ b/005-PaidSource/gsearch.py
@@ -0,0 +1,216 @@
+import os
+import random
+import re
+import ssl
+import sys
+import time
+from urllib.parse import quote_plus
+
+import chardet
+import googlesearch as ggs
+import pypandoc  # 要安装pandoc
+import requests_html
+from bs4 import BeautifulSoup
+
+from v2ray_pool import Net
+
+BLACK_DOMAIN = ['www.google.gf', 'www.google.io', 'www.google.com.lc']
+DOMAIN = 'www.google.com'
+
+
+class GSearch(Net):
+    def search_page(self, url, pause=3):
+        """
+        Google search
+        :param url: 要请求的搜索链接
+        :param pause: 随机等待的上限秒数,用于降低被风控的概率
+        :return: 页面内容,失败时返回 None
+        """
+        time.sleep(random.randint(1, pause))
+        try:
+            r = self.request_en(url)
+            print('resp code=%d' % r.status_code)
+            if r.status_code == 200:
+                charset = chardet.detect(r.content)
+                content = r.content.decode(charset['encoding'])
+                return content
+            elif r.status_code == 301 or r.status_code == 302 or r.status_code == 303:
+                location = r.headers['Location']
+                time.sleep(random.randint(1, pause))
+                return self.search_page(location)
+            # elif r.status_code == 429 or r.status_code == 443:
+            #     time.sleep(3)
+            #     return search_page(url)
+            return None
+        except Exception as e:
+            print(e)
+            return None
+
+    def parse_html(self, html):
+        soup = BeautifulSoup(html, 'html.parser')  # 声明BeautifulSoup对象
+        # find = soup.find('p')  # 使用find方法查到第一个p标签
+        # print('----->>>>%s' % str(find.text))
+        p_s = soup.find_all('p')
+        results = []
+        for p in p_s:
+            if p.find('img'):  # 不要带有图片的标签
+                continue
+            if p.find('a'):  # 不要带有链接的标签
+                continue
+            content = str(p)
+            if "文章来源" in content:
+                print('过滤[文章来源]>>>>>%s' % content)
+                continue
+            if "来源:" in content:
+                print('过滤[来源:]>>>>>%s' % content)
+                continue
+            if len(p.text.replace('\n', '').strip()) < 1:  # 过滤空内容
+                # print('过滤[空字符]>>>>>!')
+                continue
+            results.append(content)
+            results.append('


') # 隔一下 + return results + # return re.findall(r'()', html, re.DOTALL) + + def get_html(self, url): + session = requests_html.HTMLSession() + html = session.get(url) + html.encoding = html.apparent_encoding + return html.text + + def conver_to_doc(self, in_name, out_name): + try: + pypandoc.convert_file('%s.html' % in_name, 'docx', outputfile="doc/%s.docx" % out_name) + os.remove('%s.html' % in_name) + except Exception as e: + print(e) + + def download_and_merge_page(self, urls, name): + try: + page = [''' + + + + + '''] + k = 0 + size = random.randint(3, 5) # 每次合并成功5篇即可 + for i in range(len(urls)): + if k >= size: + break + try: + temp = self.parse_html(self.get_html(urls[i])) + except Exception as e: + print(e) + continue + if len(temp) < 3: # 篇幅太短 + continue + page.append( + '

第%d篇:

' % ( + k + 1)) # 加入标题 + page += temp + page.append('\n') + k += 1 + page.append('') + with open("%s.html" % name, mode="w") as f: # 写入文件 + for p in page: + f.write(p) + return k + except Exception as e: + print(e) + return 0 + + def get_full_urls(self, html): + a_s = re.findall(r'', html, re.DOTALL) + results = [] + for a in a_s: + try: + # print(a) + # url = re.findall(r'/url\?q=(.*?\.html)', a, re.DOTALL)[0] + url: str = re.findall(r'(http[s]{0,1}://.*?\.html)', a, re.DOTALL)[0] + # title = re.findall(r'(.*?)', a, re.DOTALL)[0] #会有问题 + # print('{"url":"%s","title":"%s"}' % (url, title)) + if 'google.com' in url: + continue + if url in results: + continue + # 过来同一个网站的 + domain = re.findall('http[s]{0,1}://(.*?)/', url, re.DOTALL)[0] + # 含有--的 + if '-' in domain: + continue + # www.sz.gov.cn,'.'超过4个时绝对不行的,像:bbs.jrj.ex3.http.80.ipv6.luzhai.gov.cn + if domain.count('.') > 4: + continue + for u in results: + if domain in u: + continue + results.append(url) + except Exception as e: + # print(e) + pass + return results + + def get_full_titles(self, html): + results = [] + soup = BeautifulSoup(html, "html.parser") + results = [] + for a in soup.find_all(name='a'): + + try: + h3 = a.find(name='h3') + if h3 and h3.has_attr('div'): + div = h3.find(name='div') + results.append(div.getText()) + else: + div = a.find(name='span') + results.append(div.getText()) + + except Exception as e: + print(e) + return results + + def format_common_url(self, search, domain='www.google.com', start=0): + url = 'https://{domain}/search?q={search}&start={start}' + url = url.format(domain=domain, search=quote_plus(search), start=start) + return url + + def format_full_url(self, domain, as_q='', as_epq='', as_oq='', as_eq='', as_nlo='', as_nhi='', lr='', cr='', + as_qdr='', + as_sitesearch='', + as_filetype='', tbs='', start=0, num=10): + """ + https://www.google.com/advanced_search + https://www.google.com/search?as_q=%E8%A1%A3%E6%9C%8D+%E8%A3%A4%E5%AD%90+%E6%9C%8D%E8%A3%85+%E9%A5%B0%E5%93%81+%E7%8F%A0%E5%AE%9D+%E9%93%B6%E9%A5%B0&as_epq=%E5%AE%98%E7%BD%91&as_oq=%E6%9C%8D%E8%A3%85+or+%E9%85%8D%E9%A5%B0&as_eq=%E9%9E%8B%E5%AD%90&as_nlo=20&as_nhi=1000&lr=lang_zh-CN&cr=countryCN&as_qdr=m&as_sitesearch=.com&as_occt=body&safe=active&as_filetype=&tbs= + allintext: 衣服 裤子 服装 饰品 珠宝 银饰 服装 OR or OR 配饰 "官网" -鞋子 site:.com 20..1000 + :param domain: 域名:google.com + :param as_q: 输入重要字词: 砀山鸭梨 + :param as_epq: 用引号将需要完全匹配的字词引起: "鸭梨" + :param as_oq: 在所需字词之间添加 OR: 批发 OR 特价 + :param as_eq: 在不需要的字词前添加一个减号: -山大、-"刺梨" + :param as_nlo: 起点,在数字之间加上两个句号并添加度量单位:0..35 斤、300..500 元、2010..2011 年 + :param as_nhi: 终点,在数字之间加上两个句号并添加度量单位:0..35 斤、300..500 元、2010..2011 年 + :param lr: 查找使用您所选语言的网页。 + :param cr: 查找在特定地区发布的网页。 + :param as_qdr: 查找在指定时间内更新的网页。 + :param as_sitesearch: 搜索某个网站(例如 wikipedia.org ),或将搜索结果限制为特定的域名类型(例如 .edu、.org 或 .gov) + :param as_filetype: 查找采用您指定格式的网页。如:filetype:pdf + :param tbs: 查找可自己随意使用的网页。 + :param start: 第几页,如 90:表示从第9页开始,每一页10条 + :param num: 每一页的条数 + :return: + """ + url = 'https://{domain}/search?as_q={as_q}&as_epq={as_epq}&as_oq={as_oq}&as_eq={as_eq}&as_nlo={as_nlo}&as_nhi={as_nhi}&lr={lr}&cr={cr}&as_qdr={as_qdr}&as_sitesearch={as_sitesearch}&as_occt=body&safe=active&as_filetype={as_filetype}&tbs={tbs}&start={start}&num={num}' + url = url.format(domain=domain, as_q=quote_plus(as_q), as_epq=quote_plus(as_epq), as_oq=quote_plus(as_oq), + as_eq=quote_plus(as_eq), as_nlo=as_nlo, as_nhi=as_nhi, lr=lr, cr=cr, as_qdr=as_qdr, + as_sitesearch=as_sitesearch, start=start, num=num, tbs=tbs, as_filetype=as_filetype) + return url + + +if 
__name__ == '__main__': + url = 'http://www.sz.gov.cn/cn/zjsz/nj/content/post_1356218.html' + domain: str = re.findall('http[s]{0,1}://(.*?)/', url, re.DOTALL)[0] + print(domain.count('.')) + print(domain) diff --git a/005-PaidSource/gtransfer.py b/005-PaidSource/gtransfer.py new file mode 100644 index 0000000..95434f3 --- /dev/null +++ b/005-PaidSource/gtransfer.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Description: 实现从excel文件获取关键词进行翻译后写入新文件 +@Date :2021/10/22 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" + +import json +import os +import os.path +import random +import time + +import chardet +import pandas as pd + +import file_util as futls +import v2ray_util as utils +from v2ray_pool import Net + +BLACK_DOMAIN = ['www.google.gf', 'www.google.io', 'www.google.com.lc'] +DOMAIN = 'www.google.com' + + +class GTransfer(Net): + def search_page(self, url, pause=3): + """ + Google search + :param query: Keyword + :param language: Language + :return: result + """ + time.sleep(random.randint(1, pause)) + try: + r = self.request_en(url) + print('resp code=%d' % r.status_code) + if r.status_code == 200: + charset = chardet.detect(r.content) + content = r.content.decode(charset['encoding']) + return content + elif r.status_code == 301 or r.status_code == 302 or r.status_code == 303: + location = r.headers['Location'] + time.sleep(random.randint(1, pause)) + return self.search_page(location) + return None + except Exception as e: + print(e) + return None + + def transfer(self, content): + # url = 'http://translate.google.com/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl=zh-CN&q=' + content + url = 'http://translate.google.cn/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=en&tl=zh-CN&q=' + content + try: + cache = futls.read_json('data/cache.json') + for c in cache: + if content in c: + print('已存在,跳过:{}'.format(content)) + return c.get(content) + except Exception as e: + pass + try: + result = self.search_page(url) + trans = json.loads(result)['sentences'][0]['trans'] + # 解析获取翻译后的数据 + # print(result) + print(trans) + self.local_cache.append({content: trans}) + futls.write_json(self.local_cache, 'data/cache.json') + # 写入数据吗?下次直接缓存取 + except Exception as e: + print(e) + utils.restart_v2ray() + return self.transfer(content) + return trans + + def init_param(self, file_name): + utils.restart_v2ray() + self.local_cache = [] + # 第一次加载本地的(已翻译的就不再翻译了) + try: + cache = futls.read_json('data/cache.json') + for c in cache: + self.local_cache.append(c) + except Exception as e: + pass + csv_file = os.path.join('data', file_name) + csv_out = os.path.join('data', 'out_' + file_name) + df = pd.read_excel(csv_file, sheet_name='CompetitorWords') + # 代表取出第一行至最后一行,代表取出第四列至最后一列。 + datas = df.values + size = len(df) + print('总共有{}行数据'.format(size)) + titles, titles_zh, keys1, keys2, keys3, pros = [], [], [], [], [], [] + for col in range(0, size): + t = datas[col][0] + titles.append(t) + keys1.append(datas[col][1]) + keys2.append(datas[col][2]) + keys3.append(datas[col][3]) + pros.append(datas[col][4]) + titles_zh.append(self.transfer(t)) + print('总共{},现在到{}'.format(size, col + 1)) + df_write = pd.DataFrame( + {'标题': titles, '中文标题': titles_zh, '关键词1': keys1, '关键词2': keys2, '关键词3': keys3, '橱窗产品': pros}) + df_write.to_excel(csv_out, index=False) + utils.kill_all_v2ray() + + +# http://translate.google.com/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl=zh-CN&q=what + + +if __name__ == '__main__': + g = GTransfer() + g.init_param('xxx.xls') + # 
utils.search_node() diff --git a/005-PaidSource/kaoqin.py b/005-PaidSource/kaoqin.py new file mode 100644 index 0000000..3870292 --- /dev/null +++ b/005-PaidSource/kaoqin.py @@ -0,0 +1,57 @@ +""" +@Description: excel表的常规操作,这里实现统计考勤 +@Date :2022/02/21 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" + +import pandas as pd +import calendar +from pandas._libs.tslibs.timestamps import Timestamp + + +def get_days(year, month): # 获取输出日期的列明 + dates = calendar.monthrange(year, month) + week = dates[0] # 1号那天是星期几 + days = dates[1] # 总共的天数 + print(dates) + index_time = [] + for day in range(1, days): + index_time.append('{}-{}-{} 星期{}'.format(year, month, day, (week + day) % 7)) + print(index_time) + return index_time + + +def parse_excel(csv_file, out_file, names, dates): + df = pd.read_excel(csv_file, sheet_name='Sheet') # 从文件和表格名称读取 + datas = df.values + size = len(df) + print('总共有{}行数据'.format(size)) + results = {} + for name in names: # 我是根据名字统计 + results.update({name: ['' for x in range(len(dates))]}) # 默认生成每个日期的空格 + for col in range(0, size): + s_name = datas[col][2] # 打印一下就知道去的那是哪里列的值了 + t_time: Timestamp = datas[col][6] # 我这里是时间戳,用type(datas[col][6])打印类型可知 + if s_name not in names: + continue + # 获取这天是哪一天的,name_datas是哪个人对应的列表数据 + d, h, m, name_datas = t_time.day, t_time.hour, t_time.minute, results.get(s_name) + # 早上 9:00前打卡,下午18:00后打卡,取一天最早和最晚的一次即可,门禁可能有很多数据 + tt = '2022-1-{} {}:{}'.format(d, h, m) + old = name_datas[d - 1] # 下标 + if len(old) < 5: # 空的 + name_datas[d - 1] = '{} 早 {};'.format(tt, '' if h < 9 else '异常') # 上班打卡 + else: + # 去除第一个: + first = old.split(';')[0] + last = '{} 晚 {}'.format(tt, '' if h >= 18 else '异常') + name_datas[d - 1] = '{};{}'.format(first, last) + print(results) + df_write = pd.DataFrame(results, index=dates) + df_write.to_excel(out_file, index=True) # 写入输出表格数据 + + +if __name__ == '__main__': + names = ['x1', 'x2', 'x3', 'x4', 'x5'] # 要统计那些人 + parse_excel('data/一月考勤.xls', 'data/out_kaoqin.xls', names, get_days(2022, 1)) diff --git a/005-PaidSource/keywords.py b/005-PaidSource/keywords.py new file mode 100644 index 0000000..98d9071 --- /dev/null +++ b/005-PaidSource/keywords.py @@ -0,0 +1,92 @@ +import json +import random +import re +import time +from urllib.parse import quote_plus + +from v2ray_pool import Net + +import chardet +import requests +import urllib3 +from bs4 import BeautifulSoup +from my_fake_useragent import UserAgent + + +class Keywords(Net): + '''url:https://www.5118.com/ciku/index#129''' + + def get_keys_by_net(self) -> []: + try: + r = self.request(r'https://www.5118.com/ciku/index#129') + if r.status_code != 200: + return None + r.encoding = r.apparent_encoding + # 法律
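+            # 编者注:从页面结构看,5118 词库页把每个关键词渲染成一个 <a> 标签;
+            # 下面先收集全部 <a> 节点,再用正则逐个抽取词条文本。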
+ soup = BeautifulSoup(r.text, "html.parser") + results = [] + for a in soup.find_all(name='a'): + results += re.findall(r'(.*?) []: + with open('test/key_tag.json', 'r') as f: + js_get = json.load(f) + f.close() + return js_get + + def get_titles_by_local(self) -> []: + with open('test/key_title.json', 'r') as f: + js_get = json.load(f) + f.close() + return js_get + + def get_titles_by_net(self, key): + '''通过网盘搜索检查出 + https://www.alipanso.com/search.html?page=1&keyword=%E7%90%86%E8%B4%A2&search_folder_or_file=2&is_search_folder_content=1&is_search_path_title=1&category=doc&file_extension=doc&search_model=1 + ''' + results = [] + try: + time.sleep(random.randint(1, 4)) + r = self.request_en( + r'https://www.alipanso.com/search.html?page=1&keyword=%s&search_folder_or_file=2&is_search_folder_content=1&is_search_path_title=1&category=doc&file_extension=doc&search_model=1' % key) + if r.status_code != 200: + print(r.status_code) + return None + r.encoding = r.apparent_encoding + soup = BeautifulSoup(r.text, "html.parser") + for a in soup.find_all(name='a'): + ts = re.findall(r'(.*?).doc', str(a.get_text()).replace('\n', ''), re.DOTALL) + for t in ts: + if '公众号' in t or '【' in t or '[' in t or ',' in t or ',' in t or ')' in t or ')' in t or t in results: + continue + if len(t) < 4: + continue + results.append(t) + return results + except Exception as e: + print(e) + return None + + +def test(): + js = ['a', 'b', 'c'] + with open('test/key_tag.json', 'w') as f: + json.dump(js, f) + f.close() + with open('test/key_tag.json', 'r') as f: + js_get = json.load(f) + f.close() + print(js_get) + + +if __name__ == "__main__": + # test() + keys = Keywords().get_keys_by_net() + print(keys) + with open('test/key_tag.json', 'w') as f: + json.dump(keys, f, ensure_ascii=False) + f.close() diff --git a/005-PaidSource/main.py b/005-PaidSource/main.py new file mode 100644 index 0000000..9ffe9f6 --- /dev/null +++ b/005-PaidSource/main.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Description: 关键词获取 +@Date :2021/09/22 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" +# from amazon import run_api +import json +import re + +import yagooglesearch + +import v2ray_util as utils +from gsearch import GSearch +from keywords import Keywords + + +def start_task(): + kd = Keywords() + keywords = kd.get_titles_by_local() + name = 'temp' + gs = GSearch() + i = 0 + is_need_start = True + while i < len(keywords): + if is_need_start: + utils.restart_v2ray() + gs.update_agent() + is_need_start = True + key = keywords[i] + query = 'site:gov.cn filetype:html "%s"' % key + client = yagooglesearch.SearchClient( + query, + tbs="li:1", + max_search_result_urls_to_return=100, + http_429_cool_off_time_in_minutes=49, + http_429_cool_off_factor=1.5, + proxy="socks5h://127.0.0.1:1080", + verbosity=5, + ) + client.assign_random_user_agent() + try: + page_urls = client.search() + except Exception: + continue + new_urls = [] + for u1 in page_urls: + domain: str = re.findall('http[s]{0,1}://(.*?)/', u1, re.DOTALL)[0] + # 含有--的 + if '-' in domain: + continue + # www.sz.gov.cn,'.'超过4个时绝对不行的,像:bbs.jrj.ex3.http.80.ipv6.luzhai.gov.cn + if domain.count('.') > 4: + continue + for u2 in new_urls: + if domain in u2: + continue + new_urls.append(u2) + print('过滤器链接数:%d, 过滤后链接数:%d' % (len(page_urls), len(new_urls))) + page_size = len(new_urls) + if page_size == 0: + print('[%s]获取文章链接失败!' 
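+            # TODO(编者注):上方 new_urls 的去重循环疑似应 append u1 而非 u2——
+            # 按现有写法 new_urls 始终为空(或首轮 u2 未绑定直接抛 NameError),
+            # 导致 page_size 恒为 0,这个失败分支会被反复触发。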
% key) + continue + if not gs.download_and_merge_page(page_urls, name): # 合并文章 + print('下载或者合并失败') + continue + doc_name = '%d篇%s' % (page_size, key) + gs.conver_to_doc(name, doc_name) + is_need_start = False + i += 1 + + utils.kill_all_v2ray() + + +def start_proxy_task(): + kd = Keywords() + keywords: [] = kd.get_titles_by_local() + name = 'temp' + gs = GSearch() + key_s = [] + key_s.pop() + + +def start_task2(): + kd = Keywords() + keywords: [] = kd.get_titles_by_local() + name = 'temp' + gs = GSearch() + i = 0 + is_need_start = True + key_size = len(keywords) + key = keywords.pop() + while i < key_size: + if is_need_start: + utils.restart_v2ray() + gs.update_agent() + else: + key = keywords.pop() + is_need_start = True + # key_url = gs.format_full_url(domain='google.com', as_sitesearch='.gov.cn', as_filetype='html', as_epq=key, + # lr='lang_zh-CN', cr='countryCN') + # key_url = gs.format_full_url(domain='search.iwiki.uk', as_sitesearch='gov.cn', as_filetype='html', as_epq=key, + # lr='lang_zh-CN', cr='countryCN') + # key_url = gs.format_common_url('site:gov.cn filetype:html %s' % key, domain='search.iwiki.uk') + key_url = gs.format_common_url('site:gov.cn intitle:%s' % key, domain='www.google.com') + print(key_url) + content = gs.search_page(key_url) + if content is None: + print('[%s]搜索失败,进行重试!' % key) + continue + with open('test/test_search.html', 'w') as f: + f.write(content) + f.close() + page_urls = gs.get_full_urls(content) # 获取文章的url + page_size = len(page_urls) + if page_size == 0: + print('[%s]没有内容,下一个...' % key) + else: + size = gs.download_and_merge_page(page_urls, name) + if size == 0: # 合并文章 + print('下载或者合并失败,跳过!') + else: + doc_name = '%d篇%s' % (size, key) + gs.conver_to_doc(name, doc_name) + print('生成[%s]文章成功!!!' % doc_name) + is_need_start = False + i += 1 + # 重新覆盖本地关键词 + with open('test/key_title.json', 'w') as f: + json.dump(keywords, f, ensure_ascii=False) + f.close() + + utils.kill_all_v2ray() + + +def test_titles(): + kd = Keywords() + keywords = kd.get_titles_by_local() + print('总共需要加载%d个关键词' % len(keywords)) + + +def test_task(): + kd = Keywords() + keywords = kd.get_keys_by_local() + print('总共需要加载%d个关键词' % len(keywords)) + # keywords = ['股市基金'] + i = 0 + search_keys = [] + utils.restart_v2ray() # 第一次用固定agent + is_need_start = False + while i < len(keywords): + if is_need_start: + utils.restart_v2ray() + # kd.update_agent() + is_need_start = True + key = keywords[i] + print('开始搜索:%s' % key) + titles = kd.get_titles_by_net(key) + if titles is None: + print('[%s]获取关键词标题失败!' 
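+            # 编者注:失败时故意不递增 i,外层换一个代理节点后会对同一关键词重试(注意可能无限重试)。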
% key) + continue + print(titles) + for t in titles: + if t not in search_keys: + search_keys.append(t) + # 每次都要更新一次 + with open('test/key_title.json', 'w') as f: + json.dump(search_keys, f, ensure_ascii=False) + f.close() + is_need_start = False + i += 1 + + utils.kill_all_v2ray() + + +def test_get_title(): + with open('search_page.html', 'r') as f: + page = f.read() + f.close() + gs = GSearch() + titles = gs.get_full_titles(page) # 获取文章的标题 + print(titles) + + +if __name__ == "__main__": + # utils.restart_v2ray() + utils.search_node() + # utils.kill_all_v2ray() diff --git a/005-PaidSource/other_site.py b/005-PaidSource/other_site.py new file mode 100644 index 0000000..09c3cbd --- /dev/null +++ b/005-PaidSource/other_site.py @@ -0,0 +1,574 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Description: 获取其他站点信息爬虫 +@Date :2022/1/14 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" +import os +import random +import re +import time + +import requests +from bs4 import BeautifulSoup + +import file_util as futls +import v2ray_util as utils +from v2ray_pool import Net + + +class Cncic(Net): + '''中华全国商业中心:https://www.cncic.org/''' + + def start_task(self): + utils.restart_v2ray() + cncic = Cncic() + keys = [{'cat': 92, 'name': '专题分析报告'}, {'cat': 95, 'name': '政策法规'}, {'cat': 8, 'name': '月度分析'}, + {'cat': 10, 'name': '黄金周分析'}, {'cat': 16, 'name': '零售百强'}, {'cat': 94, 'name': '市场观察'}, ] + success_datas = [] + for key in keys: + cat = key.get('cat') + name = key.get('name') + datas = cncic.load_list(cat) + while len(datas) == 0: + utils.restart_v2ray() + datas = cncic.load_list(cat) + success_datas.append({'name': name, 'data': datas}) + futls.write_json(success_datas, 'data/cncic/keys.json') # 每次保存到本地 + + success_datas = futls.read_json('data/cncic/keys.json') + key_size = len(success_datas) + is_need_start = False + key = None + for i in range(key_size): + if is_need_start: + utils.restart_v2ray() + else: + key = success_datas.pop() + if key is None: + key = success_datas.pop() + is_need_start = True + folder = key.get('name') + datas = key.get('data') + for data in datas: + try: + load_page = cncic.load_page(data.get('url')) + except Exception as e: + print(e) + continue + title, content = cncic.parse_page(load_page) + html_path = 'data/html/cncic/%s/%s.html' % (folder, title) + doc_path = 'data/doc/cncic/%s/%s.docx' % (folder, title) + futls.write_to_html(content, html_path) + try: + futls.html_cover_doc(html_path, doc_path) + except Exception as e: + print(e) + futls.write_json(success_datas, 'data/cncic/keys.json') # 更新本地数据库 + is_need_start = False + i += 1 + utils.kill_all_v2ray() + + def load_list(self, cat, paged=1) -> []: + results = [] + while True: + url = 'https://www.cncic.org/?cat=%d&paged=%d' % (cat, paged) + try: + page = self.load_page(url) + results += self.parse_list(page) + paged += 1 + time.sleep(random.randint(3, 6)) + except Exception as e: + print(e) + break + return results + + def load_page(self, url): + '''加载页面,如:https://www.cncic.org/?p=3823''' + r = self.request_zh(url) + r.encoding = r.apparent_encoding + print('Cncic[%s] code[%d]' % (url, r.status_code)) + if r.status_code == 200: + return r.text + elif r.status_code == 301 or r.status_code == 302 or r.status_code == 303: + location = r.headers['Location'] + time.sleep(1) + return self.load_page(location) + return None + + def parse_page(self, page): + '''解析页面,返回标题和文章页面内容,如果生成文章则还需要组装''' + soup = BeautifulSoup(page, 'html.parser') + article = soup.find('article') + header = article.find('header') + title = 
header.text.replace('\n', '').replace(' ', '') + content = article.find('div', class_='single-content') + result = str(header) + result += str(content) + return title, result + + def parse_list(self, page): + '''解析列表(如:https://www.cncic.org/?cat=92)页面,返回标题、连接、日期''' + soup = BeautifulSoup(page, 'html.parser') + main = soup.find('main') + articles = main.find_all('article') + results = [] + for article in articles: + header = article.find('header') + url = header.find('a').get_attribute_list('href')[0] + title = header.text.replace('\n', '').replace(' ', '') + date = article.find('span', class_='date').text + results.append({'url': url, 'title': title, 'date': date}) + return results + + +class Ceicdata(Net): + '''https://www.ceicdata.com/''' + + def start_task_1(self): + cd = Ceicdata() + utils.restart_v2ray() + success_datas = futls.read_json('data/keys/ceicdata.json') + key_size = len(success_datas) + print(key_size) + is_need_start = False + key = None + for i in range(key_size): + if is_need_start: + utils.restart_v2ray() + cd.update_agent() + else: + key = success_datas.pop() + is_need_start = True + if key is None: + key = success_datas.pop() + title = key.get('title') + url = key.get('url') + try: + page = cd.load_page(url) + except Exception as e: + page = None + print(e) + if page is None: + continue + html_path = 'data/html/ceicdata/%s.html' % title + content = cd.parse_page_1(page) + futls.write_to_html(content, html_path) + futls.html_cover_excel(html_path, 'data/doc/ceicdata/%s.xlsx' % title) + futls.write_json(success_datas, 'data/keys/ceicdata.json') # 更新本地数据库 + is_need_start = False + i += 1 + utils.kill_all_v2ray() + + @staticmethod + def start_task2(): + utils.restart_v2ray() + cd = Ceicdata() + keys_path = 'data/keys/ceicdata.json' + keys = futls.read_json(keys_path) + if not keys: + url = 'https://www.ceicdata.com/zh-hans/country/china' + page = cd.load_page(url) + if page is None: + raise Exception('获取页面失败') + keys = cd.parse_main_2(page) + if len(keys) == 0: + raise Exception('获取链接失败') + futls.write_json(keys, keys_path) + key_size = len(keys) + print('下载数量[%d]' % key_size) + is_need_start = False + key = None + for i in range(key_size): + if is_need_start: + utils.restart_v2ray() + cd.update_agent() + else: + key = keys.pop() + is_need_start = True + if key is None: + key = keys.pop() + url = key.get('url') + try: + page = cd.load_page(url) + except Exception as e: + page = None + print(e) + if page is None: + continue + try: + title, content = cd.parse_page_2(page) + except Exception as e: + print(e) + continue + html_path = 'data/html/ceicdata2/%s.html' % title + futls.write_to_html(content, html_path) + futls.html_cover_doc(html_path, 'data/doc/ceicdata/%s.docx' % title) + futls.write_json(keys, keys_path) # 更新本地数据库 + is_need_start = False + i += 1 + utils.kill_all_v2ray() + + def load_page(self, url): + '''加载页面,如:https://www.cncic.org/?p=3823''' + r = self.request_en(url) + # r = self.request(url) + r.encoding = r.apparent_encoding + print('Cncic[%s] code[%d]' % (url, r.status_code)) + if r.status_code == 200: + return r.text + elif r.status_code == 301 or r.status_code == 302 or r.status_code == 303: + location = r.headers['Location'] + time.sleep(1) + return self.load_page(location) + return None + + def parse_main_1(self, page): + '''解析页面,返回标题和文章页面内容,如果生成文章则还需要组装''' + soup = BeautifulSoup(page, 'html.parser') + main = soup.find('main') + lists = main.find('div', class_='indicators-lists') + results = [] + for a in lists.find_all('a'): + # 
https://www.ceicdata.com/zh-hans/indicator/nominal-gdp + title = a.text.replace(' ', '') + url = 'https://www.ceicdata.com' + a.get_attribute_list('href')[0].replace(' ', '') + results.append({'title': title, 'url': url}) + return results + + def parse_main_2(self, page): + '''解析页面,返回标题和文章页面内容,如果生成文章则还需要组装''' + soup = BeautifulSoup(page, 'html.parser') + main = soup.find('main') + results = [] + for tbody in main.find_all('tbody'): + for a in tbody.find_all('a'): + # https://www.ceicdata.com/zh-hans/indicator/nominal-gdp + title = a.text.replace(' ', '') + url = 'https://www.ceicdata.com' + a.get_attribute_list('href')[0].replace(' ', '') + results.append({'title': title, 'url': url}) + return results + + def parse_page_1(self, page): + '''解析页面,返回标题和文章页面内容,如果生成文章则还需要组装''' + soup = BeautifulSoup(page, 'html.parser') + main = soup.find('main') + clearfix = main.find('div', class_='clearfix') + h1 = clearfix.find('h1') + h2 = clearfix.find('h2') + tables = clearfix.find_all('table') + content = str(h1) + str(tables[0]) + str(h2) + str(tables[1]) + return content + + def parse_page_2(self, page): + '''解析页面,返回标题和文章页面内容,如果生成文章则还需要组装''' + soup = BeautifulSoup(page, 'html.parser') + main = soup.find('main') + left = main.find('div', id='left-col-7') + title = left.find('span', class_='c-purple').text.replace(' ', '').replace('\n', '') + left.find('div', id='breadcrumb').decompose() # 移除节点 + for ele in left.find_all('div', class_='hide'): + ele.decompose() # 移除节点 + for ele in left.find_all('div', class_='div-chart-btns'): + ele.decompose() # 移除节点 + for ele in left.find_all('div', class_='table-buy'): + ele.decompose() # 移除节点 + for ele in left.find_all('div', class_='div-bgr-2'): + if '查看价格选项' in str(ele): + ele.decompose() # 移除节点 + for ele in left.find_all('h4'): + if '购买' in str(ele): + ele.decompose() # 移除节点 + for ele in left.find_all('button'): + if '加载更多' in str(ele): + ele.decompose() # 移除节点 + for ele in left.find_all('div', class_='div-bgr-1'): + if '详细了解我们' in str(ele): + ele.decompose() # 移除节点 + i = 1 + for img in left.find_all('img'): + src = str(img.get('src')) + path = '/Users/Qincji/Desktop/develop/py/project/PythonIsTools/005-PaidSource/data/img/%d.svg' % i + dst = '/Users/Qincji/Desktop/develop/py/project/PythonIsTools/005-PaidSource/data/img/%d.png' % i + if 'www.ceicdata.com' in src: + print('下载图片url[%s]' % src) + r = self.request_zh(src) + # r = self.request(src) + if r.status_code == 200: + with open(path, 'wb') as f: + f.write(r.content) + futls.svg_cover_jpg(path, dst) # 将svg转换成jpg + img['src'] = dst + i += 1 + else: + raise Exception('下载图片失败!') + rs = re.sub(r'href=".*?"', '', str(left)) # 移除href + return title, rs + + +class Cnnic(Net): + '''http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/ , 注意:因为文件过大,使用别人代理下载,固定代理''' + + @staticmethod + def start_task(): + cnnic = Cnnic() + keys_path = 'data/keys/cnnic.json' + all_keys = futls.read_json(keys_path) + if not all_keys: + all_keys = [] + for i in range(7): + if i == 0: + url = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index.htm' + else: + url = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/index_%d.htm' % i + page = cnnic.load_page(url) + if page: + futls.write(page, 'test/src.html') + all_keys += cnnic.parse_page(page) + futls.write_json(all_keys, keys_path) + size = len(all_keys) + print('将要下载数量[%d]' % size) + for i in range(size): + key = all_keys.pop() + name = key.get('title') + url = key.get('url') + path = 'data/doc/cnnic/%s.pdf' % name + cnnic.download(url, path) + futls.write_json(all_keys, keys_path) + print('已下载[%d] | 还剩[%d]' % (i + 
1, size - i - 1)) + + def load_page(self, url): + time.sleep(3) + # r = self.request_en(url) + # r = self.request(url) + proxies = {'http': 'http://11.0.222.4:80', 'https': 'http://11.0.222.4:80'} + r = requests.get(url=url, headers=self._headers, allow_redirects=False, verify=False, + proxies=proxies, timeout=15) + r.encoding = r.apparent_encoding + print('Cnnic[%s] code[%d]' % (url, r.status_code)) + if r.status_code == 200: + return r.text + elif r.status_code == 301 or r.status_code == 302 or r.status_code == 303: + location = r.headers['Location'] + time.sleep(1) + return self.load_page(location) + return None + + def parse_page(self, page): + '''解析页面,返回标题和文章页面内容,如果生成文章则还需要组装''' + soup = BeautifulSoup(page, 'html.parser') + content = soup.find('div', class_='content') + results = [] + for li in content.find_all('li'): + a = li.find('a') + date = li.find('div', class_='date').text[0:4] # 只要年份 + title = a.text.replace('\n', '').replace(' ', '') + # http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/hlwtjbg/202109/P020210915523670981527.pdf + # ./hlwtjbg/202109/P020210915523670981527.pdf + url = 'http://www.cnnic.net.cn/hlwfzyj/hlwxzbg/' + str(a.get('href')).replace('./', '') + if not '年' in title: + title = '%s年发布%s' % (date, title) + results.append({'title': title, 'url': url}) + return results + + def download(self, url, path): + if os.path.exists(path): + os.remove(path) + proxies = {'http': 'http://11.0.222.4:80', 'https': 'http://11.0.222.4:80'} + r = requests.get(url=url, headers=self._headers, allow_redirects=False, verify=False, + proxies=proxies, timeout=15, stream=True) + i = 0 + print('name[%s]|code[%d]' % (path, r.status_code)) + with open(path, "wb") as pdf: + for chunk in r.iter_content(chunk_size=1024): + if chunk: + i += 1 + if i % 30 == 0: + print('.', end='') + pdf.write(chunk) + pdf.close() + time.sleep(random.randint(3, 12)) + + +class Othersite(Net): + def __init__(self): + super(Othersite, self).__init__() + self.dir = 'Othersite' + + @staticmethod + def start_task(): + reqs = [{'title': 'xxx', 'url': 'https://www.xxx.com/wedding/engagement-rings.html'}] + # utils.restart_v2ray() + sl = Othersite() + datas = futls.read_json(os.path.join('Othersite', 'page_urls.json')) + if datas is None: + datas = [] + for req in reqs: + title = req.get('title') + url = req.get('url') + has_write = False + for data in datas: + if title in data.get('title'): + has_write = True + break + if has_write: # 已经请求过了 + print('页面连接 [%s]已存在,跳过!' % title) + continue + start = 1 + page_urs = [] + while True: + if start != 1: + temp = url + '?p=' + str(start) + else: + temp = url + try: + page = sl.load_page(temp) + except Exception as e: + print(e) + print('--------000') + utils.restart_v2ray() + continue + futls.write(page, 'test/src.html') + has_next, results = sl.parse_list(page) + print('has_next: {} | {}'.format(has_next, results)) + page_urs += results + if not has_next: + break + start += 1 + # page = futls.read('test/src.html') + # print(sl.parse_details(page)) + datas.append({'title': title, 'urls': page_urs}) + futls.write_json(datas, os.path.join('Othersite', 'page_urls.json')) + all_results = [] # 总数据表 + size = len(datas) + alls_local = futls.read_json(os.path.join('Othersite', 'all.json')) + for i in range(size): + data = datas.pop() + title = data.get('title') + page_urs = data.get('urls') + has_write_all = False + # for local in alls_local: + # if title in local.get('title'): + # has_write_all = True + # break + # if has_write_all: + # print('[%s]已下载,跳过!' 
% title) + # continue + sl.dir = os.path.join('Othersite', title) + url_size = len(page_urs) + print('下载数量[%d]' % url_size) + is_need_start = False + url = None + results = [] + for i in range(url_size): + if is_need_start: + utils.restart_v2ray() + sl.update_agent() + else: + url = page_urs.pop() + is_need_start = True + if url is None: + url = page_urs.pop() + # 本地是否也已经存在 + sku1 = url[url.rfind('-') + 1:].replace('.html', '').upper() + if os.path.exists(os.path.join(sl.dir, sku1)): + is_need_start = False + print('ksu [%s]已存在,跳过!' % sku1) + continue + try: + page = sl.load_page(url) + sku = sl.parse_details(page) + results.append(sku) + is_need_start = False + except Exception as e: + print(e) + print('--------333') + all_results.append({'title': title, 'skus': results}) + futls.write_json(all_results, os.path.join('Othersite', 'all.json')) + futls.write_json(datas, os.path.join('Othersite', 'page_urls.json')) + utils.kill_all_v2ray() + + def load_page(self, url): + '''加载页面,如:https://www.cncic.org/?p=3823''' + time.sleep(random.randint(3, 8)) + r = self.request_en(url) + # r = self.request(url) + r.encoding = r.apparent_encoding + print('Othersite code[%d] |url [%s] ' % (r.status_code, url)) + if r.status_code == 200: + return r.text + elif r.status_code == 301 or r.status_code == 302 or r.status_code == 303: + location = r.headers['Location'] + time.sleep(1) + return self.load_page(location) + return None + + def parse_home(self, page): + soup = BeautifulSoup(page, 'html.parser') + nav = soup.find('nav') + results = [] + for a in nav.find_all('a'): + results.append({'title': a.text, 'url': str(a.get('href'))}) + return results + + def parse_list(self, page): + soup = BeautifulSoup(page, 'html.parser') + ol = soup.find('ol') + # 判断是否还有下一页 + next = False + pages = soup.find('div', class_='pages') + if pages: + pages_n = pages.find('li', class_='pages-item-next') + if pages_n: + next = True + results = [] + for a in ol.find_all('a'): + results.append(str(a.get('href'))) + return next, results + + def parse_details(self, page): + soup = BeautifulSoup(page, 'html.parser') + # content = soup.find('div', class_='content') + # main = content.find('main') + right = soup.find('div', class_='product-info-main') + title = right.find('h1').text.replace('Othersite ', '') + sku = right.find('div', class_='value').text + try: + price = right.find('span', id='price-saved').find('span').text + except Exception: + print('没有折扣,继续找..') + price = right.find('span', class_='special-price').find('span', class_='price').text + # 下载图片 + layout = soup.find('amp-layout') + carousel = layout.find('amp-carousel') + imgs = [] + i = 0 + content = '{}\n{}'.format(sku, title) + for img in carousel.find_all('amp-img'): + src = str(img.get('src')) + imgs.append(src) + path = self.download_img(src, sku, i) + content = content + '\n' + path + i += 1 + futls.write(content, os.path.join(self.dir, sku, '{}.txt'.format(sku))) + return {'sku': sku, 'title': title, 'price': price, 'imgs': imgs} + + def download_img(self, src, sku, i): + path = os.path.join(self.dir, sku, '{}-{}.jpg'.format(sku, i)) + pre_path, file_name = os.path.split(path) + if pre_path and not os.path.exists(pre_path): + os.makedirs(pre_path) + time.sleep(random.randint(1, 2)) + r = self.request_en(src) + if r.status_code == 200: + with open(path, 'wb') as f: + f.write(r.content) + else: + raise Exception('下载图片失败!') + return path + + +if __name__ == "__main__": + pass diff --git a/005-PaidSource/v2ray_pool/__init__.py b/005-PaidSource/v2ray_pool/__init__.py 
new file mode 100644 index 0000000..7a4ce9d --- /dev/null +++ b/005-PaidSource/v2ray_pool/__init__.py @@ -0,0 +1,13 @@ +# 运行时路径。并非__init__.py的路径 +import os +import sys + +BASE_DIR = "../002-V2rayPool" +if os.path.exists(BASE_DIR): + sys.path.append(BASE_DIR) + +from core import utils +from core.conf import Config +from core.client import Creator +from db.db_main import DBManage +from base.net_proxy import Net \ No newline at end of file diff --git a/005-PaidSource/v2ray_pool/_db-checked.txt b/005-PaidSource/v2ray_pool/_db-checked.txt new file mode 100644 index 0000000..c9f9c42 --- /dev/null +++ b/005-PaidSource/v2ray_pool/_db-checked.txt @@ -0,0 +1,10 @@ +ss://YWVzLTI1Ni1nY206ZzVNZUQ2RnQzQ1dsSklk@172.99.190.39:5003#github.com/freefq%20-%20%E7%BE%8E%E5%9B%BD%20%2010,172.99.190.39,美国 康涅狄格 +ss://YWVzLTI1Ni1nY206ZTRGQ1dyZ3BramkzUVk@172.99.190.205:9101#github.com/freefq%20-%20%E7%BE%8E%E5%9B%BD%20%2015,172.99.190.205,美国 康涅狄格 +ss://YWVzLTI1Ni1nY206ZmFCQW9ENTRrODdVSkc3QDE0NS4yMzkuMS4xMDA6MjM3NQ==#github.com%2Ffreefq+-+%E8%8B%B1%E5%9B%BD++18,145.239.1.100,德国 Hessen +ss://Y2hhY2hhMjAtaWV0Zi1wb2x5MTMwNTpzRjQzWHQyZ09OcWNnRlg1NjNAMTQxLjk1LjAuMjY6ODI2#github.com%2Ffreefq+-+%E8%8B%B1%E5%9B%BD++13,54.38.217.138,美国 新泽西 +ss://YWVzLTI1Ni1nY206WTZSOXBBdHZ4eHptR0NAMTcyLjk5LjE5MC4xNDk6MzMwNg==#github.com%2Ffreefq+-+%E7%BE%8E%E5%9B%BD++14,172.99.190.149,美国 康涅狄格 +ss://YWVzLTI1Ni1nY206ZmFCQW9ENTRrODdVSkc3QDE3Mi45OS4xOTAuMTg4OjIzNzY=#github.com%2Ffreefq+-+%E7%BE%8E%E5%9B%BD++11,172.99.190.188,美国 康涅狄格 +ss://YWVzLTI1Ni1nY206UENubkg2U1FTbmZvUzI3QDEzNC4xOTUuMTk2LjE0Nzo4MDkw#github.com%2Ffreefq+-+%E5%8C%97%E7%BE%8E%E5%9C%B0%E5%8C%BA++9,134.195.196.147,美国 +ss://YWVzLTI1Ni1nY206ekROVmVkUkZQUWV4Rzl2QDE2OS4xOTcuMTQxLjkxOjYzNzk=#github.com%2Ffreefq+-+%E5%8C%97%E7%BE%8E%E5%9C%B0%E5%8C%BA++22,169.197.141.91,美国 佐治亚 亚特兰大 +ss://YWVzLTI1Ni1nY206ZmFCQW9ENTRrODdVSkc3QDEzNC4xOTUuMTk2LjE5MzoyMzc2#github.com%2Ffreefq+-+%E5%8C%97%E7%BE%8E%E5%9C%B0%E5%8C%BA++15,134.195.196.193,美国 +ss://YWVzLTI1Ni1nY206ZzVNZUQ2RnQzQ1dsSklkQDEzNC4xOTUuMTk2LjIwMDo1MDAz#github.com%2Ffreefq+-+%E5%8C%97%E7%BE%8E%E5%9C%B0%E5%8C%BA++10,134.195.196.200,美国 diff --git a/005-PaidSource/v2ray_pool/_db-uncheck.txt b/005-PaidSource/v2ray_pool/_db-uncheck.txt new file mode 100644 index 0000000..e69de29 diff --git a/005-PaidSource/v2ray_util.py b/005-PaidSource/v2ray_util.py new file mode 100644 index 0000000..a7308b7 --- /dev/null +++ b/005-PaidSource/v2ray_util.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Description: 管理v2ray_pool的工具 +@Date :2022/1/14 +@Author :xhunmon +@Mail :xhunmon@gmail.com +""" + +import time + +from v2ray_pool import utils, Config, DBManage + + +def search_node(): + # 如果有系统全局代理,可不需要开启v2ray_core代理,GoogleTrend(proxies=False) + utils.kill_all_v2ray() + Config.set_v2ray_core_path('/Users/Qincji/Desktop/develop/soft/intalled/v2ray-macos-64') # v2ray内核存放路径 + Config.set_v2ray_node_path( + '/Users/Qincji/Desktop/develop/py/project/PythonIsTools/005-PaidSource/v2ray_pool') # 保存获取到节点的路径 + proxy_url = 'ss://YWVzLTI1Ni1nY206UENubkg2U1FTbmZvUzI3@145.239.1.137:8091#github.com/freefq%20-%20%E8%8B%B1%E5%9B%BD%20%207' + dbm = DBManage() + dbm.init() # 必须初始化 + # if dbm.check_url_single(proxy_url): + # urls = dbm.load_urls_by_net(proxy_url=proxy_url) + # dbm.check_and_save(urls, append=False) + dbm.load_urls_and_save_auto() + # urls = dbm.load_unchecked_urls_by_local() + # dbm.check_and_save(urls, append=False) + utils.kill_all_v2ray() + + +def restart_v2ray(isSysOn=False): + utils.kill_all_v2ray() + 
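+    # 编者注:重启流程为——先清掉残留的 v2ray 进程,再重设内核与节点库路径,
+    # 最后随机挑一个本地已验证节点启动,失败则循环重试(见下方 while)。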
diff --git a/006-TikTok/.gitignore b/006-TikTok/.gitignore
new file mode 100644
index 0000000..b6e4761
--- /dev/null
+++ b/006-TikTok/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/006-TikTok/006-TikTok.iml b/006-TikTok/006-TikTok.iml
new file mode 100644
index 0000000..f4e6189
--- /dev/null
+++ b/006-TikTok/006-TikTok.iml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/006-TikTok/README.md b/006-TikTok/README.md
new file mode 100644
index 0000000..ede0e21
--- /dev/null
+++ b/006-TikTok/README.md
@@ -0,0 +1,12 @@
+# App automation
+
+## Result
+
+Automatically posts comments in the Douyin and TikTok apps.
+
+## How it works
+1. An Android phone that can connect to the computer normally over adb.
+2. The [uiautomator2](https://github.com/openatx/uiautomator2) open source library.
+3. weditor, used to grab element info (xpath, etc.).
+
+> Note: the proxy on the computer needs to be turned off
\ No newline at end of file
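The scripts below all follow the same uiautomator2 recipe the README describes: connect over adb, select a control by resourceId/text, then click or type. A minimal sketch; the package name and resourceId are placeholders, not real ids from either app:

```
import uiautomator2 as u2

d = u2.connect()       # connect to the device found via adb
d.implicitly_wait(20)  # wait up to 20s for elements to appear

d.app_start('com.example.app', stop=True)           # hypothetical package name
d(resourceId='com.example.app:id/search').click()   # hypothetical resourceId
d(text='Search', className='android.widget.TextView').click()
d.swipe_ext('up')      # swipe up, e.g. to the next video
```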
diff --git a/006-TikTok/__init__.py b/006-TikTok/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/006-TikTok/dy_review.py b/006-TikTok/dy_review.py
new file mode 100644
index 0000000..c1af3c2
--- /dev/null
+++ b/006-TikTok/dy_review.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Description: auto-comment script for the Douyin app
+@Date :2021/12/22
+@Author :xhunmon
+@Mail :xhunmon@gmail.com
+"""
+import random
+import time
+
+import uiautomator2 as u2
+
+d = u2.connect()
+d.implicitly_wait(80)
+
+
+def review_douyin():  # post comments
+    d.press("home")
+    d.app_start('com.ss.android.ugc.aweme', stop=True)
+    time.sleep(1)
+    d(resourceId="com.ss.android.ugc.aweme:id/foj").click()  # tap search
+    time.sleep(1)
+    d(resourceId="com.ss.android.ugc.aweme:id/et_search_kw").click()  # tap the input box, in case the keyboard does not pop up
+    keys = ['元宵节创意视频']  # , '情人节', '搞笑视频', '真人动漫特效'
+    comments = ['[比心]', '[强壮]', '[击掌]', '[给力]', '[爱心]', '[派对]', '[不看]', '[炸弹]', '[憨笑]', '[悠闲]', '[嘿哈]', '[西瓜]', '[咖啡]',
+                '[太阳]', '[月亮]', '[发]', '[红包]', '[拳头]', '[勾引]', '[胜利]', '[抱拳]', '[左边]', '[送心]', '[来看我]', '[来看我]',
+                '[来看我]', '[灵机一动]', '[耶]', '[色]', '[震惊]', '[小鼓掌]', '[发呆]', '[偷笑]', '[石化]', '[思考]', '[笑哭]', '[奸笑]',
+                '[坏笑]', '[得意]', '[钱]', '[亲亲]', '[愉快]', '[玫瑰]', '[赞]', '[鼓掌]', '[感谢]', '[666]', '[胡瓜]', '[啤酒]', '[飞吻]',
+                '[紫薇别走]', '[听歌]', '[绝望的凝视]', '[不失礼貌的微笑]', '[吐舌]', '[呆无辜]', '[看]', '[熊吉]', '[黑脸]', '[吃瓜群众]', '[呲牙]',
+                '[绿帽子]', '[摸头]', '[皱眉]', '[OK]', '[碰拳]', '[强壮]', '[比心]', '[吐彩虹]', '[奋斗]', '[敲打]', '[惊喜]', '[如花]', '[强]',
+                '[做鬼脸]', '[尬笑]', '[红脸]', '牛啊', '牛啊牛', 'nb', '666', '赞一个', '赞', '棒', '学到了', '1', '已阅', '板凳',
+                '插一楼:变戏法的亮手帕', '插一楼:狗吃豆腐脑', '插一楼:癞蛤蟆打伞', '插一楼:离了水晶宫的龙', '插一楼:盲人聊天', '插一楼:五百钱分两下', '插一楼:盲公戴眼镜',
+                '插一楼:王八倒立', '插一楼:癞蛤蟆背小手', '插一楼:韩湘子吹笛', '插一楼:剥了皮的蛤蟆', '插一楼:马蜂蜇秃子', '插一楼:冷水烫鸡', '插一楼:老孔雀开屏', '插一楼:大姑娘养的',
+                '插一楼:三角坟地', '插一楼:蜡人玩火', '插一楼:发了霉的葡萄', '插一楼:厥着看天', '插一楼:种地不出苗', '插一楼:老虎头上的苍蝇', '插一楼:雷婆找龙王谈心',
+                '插一楼:菩萨的胸怀', '插一楼:牛屎虫搬家', '插一楼:变戏法的拿块布', '插一楼:老虎上吊', '插一楼:王八', '插一楼:老虎吃田螺', '插一楼:大肚子踩钢丝', '插一楼:耗子腰粗',
+                '插一楼:乌龟的', '插一楼:神仙放屁', '插一楼:麻油煎豆腐', '插一楼:汽车坏了方向盘', '插一楼:病床上摘牡丹', '插一楼:芝麻地里撒黄豆', '插一楼:打开棺材喊捉贼',
+                '插一楼:卤煮寒鸭子', '插一楼:鲤鱼找鲤鱼,鲫鱼找鲫鱼', '插一楼:癞蛤蟆插羽毛', '插一楼:烂伞遮日', '插一楼:老虎头上的苍蝇', '插一楼:三角坟地', '插一楼:卖布兼卖盐',
+                '插一楼:耗子腰粗', '插一楼:老孔雀开屏', '插一楼:筐中捉鳖', '插一楼:拐子进医院', '插一楼:茅房里打灯笼', '插一楼:癞蛤蟆背小手', '插一楼:肉骨头吹喇叭',
+                '插一楼:老鼠进棺材', '插一楼:种地不出苗', '插一楼:病床上摘牡丹', '插一楼:裤裆里摸黄泥巴', '插一楼:狗拿耗子', '插一楼:铁匠铺的料', '插一楼:高梁撒在粟地里',
+                '插一楼:茅厕里题诗', '插一楼:痰盂里放屁', '插一楼:老母猪打喷嚏', '插一楼:厕所里点灯', '插一楼:棺材铺的买卖', '插一楼:老公鸡着火', '插一楼:乌龟翻筋斗',
+                '插一楼:被窝里的跳蚤', '插一楼:赶着牛车拉大粪', '插一楼:老太婆上鸡窝', '插一楼:狗背上贴膏药', '插一楼:狗咬瓦片', '插一楼:哪吒下凡', '插一楼:二十一天不出鸡',
+                '插一楼:鞭炮两头点', '插一楼:抱黄连敲门', '插一楼:猫儿踏破油瓶盖', '插一楼:和尚念经', '插一楼:裁缝不带尺', '插一楼:上山钓鱼', '插一楼:狗长犄角',
+                '插一楼:带着存折进棺材', '插一楼:豁子拜师', '插一楼:宁来看棋', '插一楼:盲公戴眼镜', '插一楼:南来的燕,北来的风', '插一楼:杯水车薪', '插一楼:玉皇大帝放屁',
+                '插一楼:给刺儿头理发', '插一楼:九月的甘蔗', '插一楼:两只公牛打架', '插一楼:百川归海', '插一楼:挨打的乌龟', '插一楼:和尚挖墙洞', '插一楼:八月十五蒸年糕',
+                '插一楼:毒蛇钻进竹筒里', '插一楼:苍蝇叮菩萨', '插一楼:白布进染缸', '插一楼:粪堆上开花', '插一楼:癞蛤蟆上蒸笼',
+                '插楼:沙漠里钓鱼', '插楼:青㭎树雕菩萨', '插楼:看鸭勿上棚', '插楼:下大雨前刮大风', '插楼:在看羊的狗', '插楼:耍大刀里唱小生', '插楼:罗锅上山', '插楼:大车不拉',
+                '插楼:瞎子白瞪眼', '插楼:铁拐的葫芦', '插楼:苣荬菜炖鲇鱼', '插楼:旅馆里的蚊子', '插楼:石刻底下的冰瘤子', '插楼:吃稀饭摆脑壳', '插楼:叫化子背不起', '插楼:火车拉大粪',
+                '插楼:寿星玩琵琶', '插楼:六月的腊肉', '插楼:夜叉骂街', '插楼:孩儿的脊梁', '插楼:长了个钱串子脑袋', '插楼:现场看乒乓球比赛', '插楼:寡妇梦丈夫', '插楼:马背上放屁',
+                '插楼:落雨出太阳', '插楼:猴子捡生姜', '插楼:啄木鸟屙薄屎', '插楼:鸡毛扔火里', '插楼:油火腿子被蛇咬', '插楼:属秦椒的', '插楼:千亩地里一棵草', '插楼:药铺倒了',
+                '插楼:黄连水做饭', '插楼:卸架的黄烟叶儿', '插楼:螺蛳壳里赛跑', '插楼:躲了和尚躲不了庙', '插楼:驴槽子里面伸出一颗头来', '插楼:老妈妈吃火锅', '插楼:阎王的脸',
+                '插楼:吃粮勿管事', '插楼:脚跟拴石头', '插楼:麻秸秆儿打狼', '插楼:阎王7粑子',
+                '插楼:画上的美女', '插楼:团鱼下滚汤', '插楼:孔夫子的脸', '插楼:曹操贪慕小乔',
+                '插楼:蒙住眼睛走路', '插楼:炒菜不放盐', '插楼:三月里的桃花', '插楼:老鼠吃面饽', '插楼:粥锅里煮铁球', '插楼:戴起眼镜喝滚茶', '插楼:吃香油唱曲子', '插楼:过冬的咸菜缸',
+                '插楼:三个小鬼没抓住', '插楼:对着坛子放屁', '插楼:赤骨肋受棒', '插楼:百灵鸟唱歌', '插楼:雨过天晴放干雷', '插楼:拄着拐棍上炭窑', '插楼:搁着料吃草', '插楼:王八碰桥桩',
+                '插楼:水上油', '插楼:偷鸡不得摸了一只鸭子', '插楼:黄瓜熬白瓜', '插楼:海瑞的棺材', '插楼:蛤蟆翻田坎', '插楼:乌龟进砂锅', '插楼:夜壶出烟', '插楼:李逵骂宋江',
+                '插楼:小孩买个花棒槌', '插楼:漏网之虾', '插楼:一口吹灭火焰山', '插楼:冷水调浆']
+    for key in keys:
+        # note: the keyboard's search action cannot be used here
+        d(resourceId="com.ss.android.ugc.aweme:id/et_search_kw").clear_text()  # clear the history
+        d(resourceId="com.ss.android.ugc.aweme:id/et_search_kw").set_text(key)  # type the keyword
+        time.sleep(1)
+        d(text='搜索', className='android.widget.TextView').click()  # tap search
+        # d(resourceId=" com.ss.android.ugc.aweme:id/d0t").click()  # tap search
+        time.sleep(1)
+        d(text='视频', className='android.widget.Button').click()  # tap Videos, then open the first result
+        d.xpath(
+            '//*[@resource-id="com.ss.android.ugc.aweme:id/gw1"]/android.widget.FrameLayout[1]').click()
+        stop, index = random.randint(15, 30), 0
+        while index < stop:  # scroll through a random number of videos
+            print('Total: {} | current: {}'.format(stop, index))
+            try:
+                time.sleep(random.randint(3, 12))  # random pause of 3~12 seconds
+                d(resourceId="com.ss.android.ugc.aweme:id/b2b").click()  # tap the comment button
+            except Exception as e:
+                print(e)
+            try:
+                time.sleep(random.randint(1, 2))  # random pause of 1~2 seconds
+                d(resourceId="com.ss.android.ugc.aweme:id/b1y").click()  # tap to bring up the keyboard
+            except Exception as e:
+                print(e)
+            time.sleep(random.randint(1, 2))  # random pause of 1~2 seconds
+            try:
+                d(resourceId="com.ss.android.ugc.aweme:id/b1y").set_text(
+                    comments[random.randint(0, len(comments) - 1)])  # type the comment
+            except Exception as e:
+                print(e)
+            try:
+                time.sleep(random.randint(1, 2))  # random pause of 1~2 seconds
+                d(resourceId="com.ss.android.ugc.aweme:id/b1r").click()  # send
+            except Exception as e:
+                print(e)
+            try:
+                time.sleep(random.randint(1, 2))  # random pause of 1~2 seconds
+                d(resourceId="com.ss.android.ugc.aweme:id/back_btn").click()  # close
+            except Exception as e:
+                print(e)
+            try:
+                time.sleep(random.randint(5, 15))  # random pause of 5~15 seconds
+                d.swipe_ext("up")  # swipe up to the next video
+            except Exception as e:
+                print(e)
+            index += 1
+        d(resourceId="com.ss.android.ugc.aweme:id/back_btn").click()  # back to search
+
+
+'''
+Tips:
+1. Use weditor to inspect elements.
+2. When a resourceId cannot be found, first use d(text='画动漫人物', className='android.widget.EditText').info to look up a resourceId, then use that. The text of the input box keeps changing, so it cannot be matched directly.
+'''
+
+if __name__ == "__main__":
+    review_douyin()
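The second tip above is a general trick: when a control's text changes at runtime, match it once by class, read the stable resourceId out of `.info`, and select by resourceId from then on. A short sketch of that lookup:

```
import uiautomator2 as u2

d = u2.connect()

# The text content varies, so match the control by class instead
box = d(className='android.widget.EditText')
info = box.info                      # dict with 'resourceName', 'text', 'bounds', ...
rid = info.get('resourceName')       # the stable resourceId, if the control has one
if rid:
    d(resourceId=rid).set_text('hello')  # stable selector from here on
```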
diff --git a/006-TikTok/file_util.py b/006-TikTok/file_util.py
new file mode 100644
index 0000000..961aa0a
--- /dev/null
+++ b/006-TikTok/file_util.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Description: file handling helpers
+@Date :2022/01/22
+@Author :xhunmon
+@Mail :xhunmon@gmail.com
+"""
+
+import datetime
+import json
+import os
+import re
+import shutil
+
+import cairosvg
+import pandas as pd
+import pypandoc  # requires pandoc to be installed
+from docx import Document
+
+
+def file_name(file_dir):
+    results = []
+    for root, dirs, files in os.walk(file_dir):
+        # print(root)  # current directory path
+        # print(dirs)  # all subdirectories under the current path
+        # print(files)  # all non-directory files under the current path
+        results += files
+    return results
+
+
+def deal_one_page():
+    fs = file_name('100条')
+    for f in fs:
+        try:
+            print('Checking [%s]' % f)
+            shortname, extension = os.path.splitext('%s' % f)
+            print('Checking [%s]' % shortname)
+            if '1篇' in shortname:
+                new_name = re.sub(r'1篇', '', f)
+                document = Document(r"html/%s" % f)
+                paragraphs = document.paragraphs
+                p = paragraphs[0]
+                p._element.getparent().remove(p._element)
+                document.save(r"html/%s" % new_name)
+                os.remove('html/%s' % f)
+        except Exception as e:
+            print(e)
+
+
+def copy_doc():
+    fs = file_name('all')
+    i = 1
+    k = 1
+    temp_dir = '01'
+    os.makedirs('100条/%s' % temp_dir)
+    for f in fs:
+        try:
+            # print('Checking [%s]' % f)
+            shortname, extension = os.path.splitext('%s' % f)
+            shutil.copyfile(r'all/%s' % f, r'100条/%s/%s' % (temp_dir, f))
+            if i % 100 == 0:
+                temp_dir = '0%d' % k if k < 10 else '%d' % k
+                k += 1
+                os.makedirs('100条/%s' % temp_dir)
+            i += 1
+        except Exception as e:
+            print(e)
+
+
+'''######## file conversion helpers ########'''
+
+
+def html_cover_doc(in_path, out_path):
+    '''Convert html to docx'''
+    path, file_name = os.path.split(out_path)
+    if path and not os.path.exists(path):
+        os.makedirs(path)
+    pypandoc.convert_file(in_path, 'docx', outputfile=out_path)
+
+
+def svg_cover_jpg(src, dst):
+    '''
+    drawing = svg2rlg("drawing.svg")
+    renderPDF.drawToFile(drawing, "drawing.pdf")
+    renderPM.drawToFile(drawing, "fdrawing.png", fmt="PNG")
+    renderPM.drawToFile(drawing, "drawing.jpg", fmt="JPG")
+    '''
+    path, file_name = os.path.split(dst)
+    if path and not os.path.exists(path):
+        os.makedirs(path)
+    # drawing = svg2rlg(src)
+    # renderPM.drawToFile(drawing, dst, fmt="JPG")
+    cairosvg.svg2png(url=src, write_to=dst)
+
+
+def html_cover_excel(content, out_path):
+    '''Convert html tables to excel'''
+    path, file_name = os.path.split(out_path)
+    if path and not os.path.exists(path):
+        os.makedirs(path)
+    tables = pd.read_html(content, encoding='utf-8')
+    writer = pd.ExcelWriter(out_path)
+    for i in range(len(tables)):
+        tables[i].to_excel(writer, sheet_name='表%d' % (i + 1))  # startrow
+    writer.save()  # flush to disk
+
+
+def write_to_html(content, file_path):
+    '''Write content to a local file, adding the html head automatically'''
+    page = '''
+
+
+
+    '''
+    page += content
+    page += '''
+    '''
+    write(page, file_path)
+
+
+def write_json(content, file_path):
+    '''Write json'''
+    path, file_name = os.path.split(file_path)
+    if path and not os.path.exists(path):
+        os.makedirs(path)
+    with open(file_path, 'w') as f:
+        json.dump(content, f, ensure_ascii=False)
+
+
+def read_json(file_path):
+    '''Read json'''
+    with open(file_path, 'r') as f:
+        js_get = json.load(f)
+    return js_get
+
+
+def write(content, file_path):
+    '''Write text content'''
+    path, file_name = os.path.split(file_path)
+    if path and not os.path.exists(path):
+        os.makedirs(path)
+    with open(file_path, 'w') as f:
+        f.write(content)
+
+
+def read(file_path) -> str:
+    '''Read text content'''
+    content = None
+    try:
+        with open(file_path, 'r') as f:
+            content = f.read()
+    except Exception as e:
+        print(e)
+    return content
+
+
+def get_next_folder(dst, day_diff, folder, max_size):
+    '''Walk the dated folders until one is missing or holds fewer than max_size files, then return it'''
+    while True:
+        day_time = (datetime.date.today() + datetime.timedelta(days=day_diff)).strftime('%Y-%m-%d')  # continue with the next day's folder
+        folder_path = os.path.join(dst, day_time, folder)
+        if os.path.exists(folder_path):  # folder already exists
+            size = len(next(os.walk(folder_path))[2])
+            if size >= max_size:  # time to move to the next folder
+                day_diff += 1
+                continue
+        else:
+            os.makedirs(folder_path)
+        return day_diff, folder_path
+
+
+if __name__ == '__main__':
+    pass
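`get_next_folder` above returns the first dated folder that still has room for files. A hypothetical usage sketch that spreads files into buckets of 100, assuming it runs next to file_util.py; the 'all' and 'out' directory names are placeholders:

```
import os
import shutil

from file_util import get_next_folder

# Spread everything under 'all/' into dated folders of at most 100 files each
day_diff = 0
for name in sorted(os.listdir('all')):
    day_diff, folder = get_next_folder('out', day_diff, 'docs', max_size=100)
    shutil.copyfile(os.path.join('all', name), os.path.join(folder, name))
```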
diff --git a/006-TikTok/main.py b/006-TikTok/main.py
new file mode 100644
index 0000000..23a5da0
--- /dev/null
+++ b/006-TikTok/main.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Description: tiktok-related open source libraries
+@Date :2021/12/22
+@Author :xhunmon
+@Mail :xhunmon@gmail.com
+"""
+# import TikTokApi
+from TikTokAPI import TikTokAPI
+
+# https://github.com/avilash/TikTokAPI-Python
+
+if __name__ == "__main__":
+    pass
diff --git a/006-TikTok/tikstar.py b/006-TikTok/tikstar.py
new file mode 100644
index 0000000..6d8fb9b
--- /dev/null
+++ b/006-TikTok/tikstar.py
@@ -0,0 +1,29 @@
+"""
+@Description: parse content from www.tikstar.com to collect tags
+@Date :2021/12/22
+@Author :xhunmon
+@Mail :xhunmon@gmail.com
+"""
+from bs4 import BeautifulSoup
+
+import file_util as futil
+
+
+def parse_tags(page):
+    '''Parse the tag table page and return one "tag / videos / views" row per entry'''
+    soup = BeautifulSoup(page, 'html.parser')
+    trs = soup.find('tbody').find_all('tr')
+    result = []
+    for tr in trs:
+        tds = tr.find_all('td')
+        tag_name = tds[0].find('h3').text.replace('\n', '').replace(' ', '')
+        video_num = tds[1].text.replace('\n', '').replace(' ', '')
+        views = tds[2].text.replace('\n', '').replace(' ', '')
+        result.append('tag: {}  videos: {}  views: {}'.format(tag_name, video_num, views))
+    return result
+
+
+if __name__ == '__main__':
+    html = futil.read('source/tags.html')
+    result = parse_tags(html)
+    print(result)
+    futil.write_json(result, 'source/handmade.json')
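A small aside on the comment pickers used by dy_review.py above and tt_review.py below: `comments[random.randint(0, len(comments) - 1)]` is correct, but `random.choice` does the same thing and reads better. A trivial sketch with a trimmed sample list:

```
import random

comments = ['nice', 'good job', '[cool]', '[angel]']  # trimmed sample list

# same distribution as comments[random.randint(0, len(comments) - 1)]
cur_comment = random.choice(comments)
print(cur_comment)
```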
diff --git a/006-TikTok/tt_review.py b/006-TikTok/tt_review.py
new file mode 100644
index 0000000..705ec25
--- /dev/null
+++ b/006-TikTok/tt_review.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@Description: auto-comment script for the TikTok app (version 22.8.2)
+@Date :2021/12/22
+@Author :xhunmon
+@Mail :xhunmon@gmail.com
+"""
+import random
+import time
+from datetime import datetime
+
+import uiautomator2 as u2
+
+'''
+https://github.com/openatx/uiautomator2
+Run "pip3 install -U uiautomator2" to install uiautomator2.
+Run "python3 -m uiautomator2 init" to install the apk with the httprpc service onto the Android phone.
+uiautomator2 usage: https://python.iitter.com/other/35522.html
+Use weditor to grab elements.
+(Remember to turn off the proxy on the computer.)
+'''
+
+d = u2.connect()
+print(d.info)
+d.implicitly_wait(20)
+
+comments = ['good job', 'good', 'look me', 'crazy', 'emm...', 'I wish you a happy new year',
+            'May you be happy, lucky and happy.', 'I wish you a happy new year and good luck!',
+            'to put people and life above everything else', 'heroes in harm’s way', 'spirited', ' behind wave',
+            'mythical creatures', 'dagongren, which refers to people who work for others', 'involution',
+            'Versailles literature', 'Look at my', 'Too good', 'To learn', 'learned', 'Thank you', 'I got it.',
+            '666', 'nice', 'Well done', 'Look at my', 'Wonderful', 'Mine is not bad either.', 'Kudos', 'like u',
+            'lean it', 'well...', '😊', 'My god!', 'Me too', 'I see', 'Come on', 'See you', 'Allow me', 'Have fun',
+            'I\'m home', 'Bless you!', 'Follow me', 'Good luck!', 'Bottoms up!', 'Guess what?', 'Keep it up!',
+            'Time is up', 'I like it!', 'That\'s neat', 'Let\'s face it.', 'Let\'s get started', 'Is that so',
+            'That\'s something', 'Do you really mean it', 'Mind you', 'I am behind you', 'That depends',
+            'What\'s up today?', 'Cut it out', 'What did you say', 'Knock it off', '[angel]', '[astonish]',
+            '[awkward]', '[blink]', '[complacent]', '[cool]', '[cool][cute]', '[cool][cool]', '[cool][cool][cool]',
+            '[cry]', '[cute]', '[cute][cute]', '[cute][cute][cute]', '[disdain]', '[drool]',
+            '[embarrassed]', '[evil]', '[excited]', '[facewithrollingeyes]', '[flushed]', '[funnyface]', '[greedy]',
+            '[happy]', '[hehe]', '[joyful]', '[laugh]', '[laughwithtears]', '[loveface]', '[lovely]', '[nap]',
+            '[pride]', '[proud]', '[rage]', '[scream]', '[shock]', '[shout]', '[slap]', '[smile]', '[smileface]',
+            '[speechless]', '[stun]', '[sulk]', '[surprised]', '[tears]', '[thinking]', '[weep]', '[wicked]',
+            '[wow]', '[wronged]', '[yummy]',
+            ' You kick ass.', ' You did a great job.', " You're a really strong person.", ' You read a lot.',
+            ' That was impressive.', ' Your work on that project was incredible.', ' Keep up the good work!',
+            " We're so proud of you.", ' How smart of you!', ' You have a real talent.',
+            ' Well, a good teacher makes good student.', ' I would like to compliment you on your diligence.',
+            " We're proud of you.", ' He has a long head.', ' You look great today.', " You're beautiful/gorgeous.",
+            ' You look so healthy.', ' You look like a million dollars.', ' You have a good taste.',
+            ' I am impressed.', ' You inspire me.', ' You are an amazing friend.', 'You are such a good friend.',
+            ' You have a good sense of humor.', " You're really talented.", " You're so smart.",
+            " You've got a great personality.", ' You are just perfect!', ' You are one of a kind.',
+            ' You make me want to be a better person.', 'brb', 'g2g', 'AMA', 'dbd', 'this look great',
+            'we’re so proud of you.', 'nice place.',
+            'nice going! ', 'emm...amazing!', 'ohh...unbelievable!',
+            'yeh,impressive.', 'terrific..', 'fantastic!', 'fabulous.', 'attractive..', 'hei...splendid.',
+            'ooh, remarkable', 'gorgeous', 'h.., glamorous', 'marvelous.', 'brilliant..', 'well...glorious',
+            'outstanding...', 'stunning!', 'appealing.', 'yeh,impressive[cool]', 'terrific[angel]', 'fantastic[cool]',
+            'fabulous[angel]', 'attractive[cool]', 'splendid[angel]', 'remarkable[cool]', 'gorgeous[angel]',
+            'glamorous[angel]', 'marvelous[cool]', 'brilliant[angel]', 'glorious[cool]', 'outstanding[angel]',
+            'stunning[cool]', 'appealing[angel]', 'Would you like me?[angel]', 'Do you like crafts?[angel]',
+            'I have a new creative work, welcome![thinking]']
+
+print('There are {} random comments in total!'.format(len(comments)))
+
+
+def start_vpn():  # launch the proxy app
+    d.press("home")
+    # d.app_stop('com.v2ray.ang')
+    d.app_start('com.v2ray.ang')
+    if 'Not' in d(resourceId="com.v2ray.ang:id/tv_test_state").get_text():
+        print_t('Starting v2ray...')
+        d(resourceId='com.v2ray.ang:id/fab').click()
+    if 'Connected' in d(resourceId="com.v2ray.ang:id/tv_test_state").get_text():
+        print_t('v2ray started, testing the connection speed...')
+        d(resourceId='com.v2ray.ang:id/layout_test').click()
+        while 'Testing' in d(resourceId="com.v2ray.ang:id/tv_test_state").get_text():
+            time.sleep(1)
+    print_t(d(resourceId="com.v2ray.ang:id/tv_test_state").get_text())
+
+
+def review_forYou():
+    # d.press("home")
+    # d.app_start('com.zhiliaoapp.musically', stop=True)
+    time.sleep(1)
+    stop, index = random.randint(40, 70), 0
+    while index < stop:  # scroll through a random number of videos
+        cur_comment = comments[random.randint(0, len(comments) - 1)]
+        print_t('foryou - total: {} | current: {}\tcomment: {}'.format(stop, index, cur_comment))
+        comment_foryou(cur_comment)
+        try:
+            time.sleep(random.randint(7, 35))  # random pause of 7~35 seconds
+            d.swipe_ext("up")  # swipe up to the next video
+        except Exception as e:
+            print_t(e)
+        index += 1
+
+
+def review_tiktok():  # post comments
+    keys = ['handmadecraft']  # '#homedecor #flowershower #handmadecraft'
+    d.press("home")
+    d.app_start('com.zhiliaoapp.musically', stop=False)
+    time.sleep(1)
+    try:
+        d(text='Discover', className='android.widget.TextView').click()  # tap Discover
+        time.sleep(1)
+        d(resourceId="com.zhiliaoapp.musically:id/fbt").click()  # tap search
+    except Exception as e:
+        print_t(e)
+    for key in keys:
+        # note: the keyboard's search action cannot be used here
+        try:
+            d(resourceId="com.zhiliaoapp.musically:id/b15").clear_text()  # clear the history
+            d(resourceId="com.zhiliaoapp.musically:id/b15").set_text(key)  # type the keyword
+            d(resourceId="com.zhiliaoapp.musically:id/fap").click()  # tap search
+            d(resourceId="android:id/text1", text="Videos").click()  # tap Videos, then open the first result
+            d.xpath(
+                '//*[@resource-id="com.zhiliaoapp.musically:id/cfh"]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]').click()
+        except Exception as e:
+            print_t(e)
+        stop, index = random.randint(35, 60), 0
+        while index < stop:  # scroll through a random number of videos
+            cur_comment = comments[random.randint(0, len(comments) - 1)]
+            print_t('{} - total: {} | current: {}\tcomment: {}'.format(key, stop, index, cur_comment))
+            comment(cur_comment)
+            try:
+                time.sleep(random.randint(5, 20))  # random pause of 5~20 seconds
+                d.swipe_ext("up")  # swipe up to the next video
+            except Exception as e:
+                print_t(e)
+            index += 1
+        time.sleep(random.randint(int(0.5 * 60), 5 * 60))
+        d(resourceId="com.zhiliaoapp.musically:id/t4").click()  # back to search
+
+
+def comment(content):
+    try:
+        time.sleep(random.randint(3, 10))  # random pause of 3~10 seconds
+        d(resourceId="com.zhiliaoapp.musically:id/acm").click()  # tap the comment button
+    except Exception as e:
+        print_t(e)
+    try:
+        time.sleep(random.randint(1, 3))  # random pause of 1~3 seconds
+        e_ele = d(text='Add comment...', className='android.widget.EditText')
+        e_ele.click()  # tap to bring up the keyboard
+        time.sleep(random.randint(1, 3))  # random pause of 1~3 seconds
+        e_ele.clear_text()
+        e_ele.set_text(content)  # type the comment
+    except Exception as e:
+        print_t(e)
+    try:
+        time.sleep(random.randint(1, 3))  # random pause of 1~3 seconds
+        d(resourceId="com.zhiliaoapp.musically:id/ad6").click()  # send
+    except Exception as e:
+        print_t(e)
+    # close the system keyboard
+    try:
+        time.sleep(random.randint(1, 3))  # random pause of 1~3 seconds
+        d(resourceId="com.zhiliaoapp.musically:id/t4").click()  # close the comment panel
+    except Exception as e:
+        print_t(e)
+
+
+def comment_foryou(content):
+    try:
+        time.sleep(random.randint(3, 10))  # random pause of 3~10 seconds
+        d(resourceId="com.zhiliaoapp.musically:id/acm").click()  # tap the comment button
+    except Exception as e:
+        print_t(e)
+    try:
+        time.sleep(random.randint(1, 3))  # random pause of 1~3 seconds
+        e_ele = d(text='Add comment...', className='android.widget.EditText')
+        e_ele.click()  # tap to bring up the keyboard
+        time.sleep(random.randint(1, 3))  # random pause of 1~3 seconds
+        e_ele.clear_text()
+        e_ele.set_text(content)  # type the comment
+    except Exception as e:
+        print_t(e)
+    try:
+        time.sleep(random.randint(1, 3))  # random pause of 1~3 seconds
+        d(resourceId="com.zhiliaoapp.musically:id/ad6").click()  # send
+    except Exception as e:
+        print_t(e)
+    # close the system keyboard
+    try:
+        d.press("back")  # press back once to close the system keyboard
+    except Exception as e:
+        print_t(e)
+
+
+def print_t(content):
+    dt = datetime.now()
+    print(dt.strftime('%H:%M:%S') + '\t' + str(content))
+
+
+if __name__ == "__main__":
+    # start_vpn()
+    # review_tiktok()
+    review_forYou()
+    # comment('Look at my')
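`start_vpn` above polls the status label in a bare `while 'Testing' in ...` loop, which can hang forever if the app never leaves that state. A minimal sketch of the same polling with a timeout guard; the lambda at the bottom is a hypothetical usage, not part of the script:

```
import time


def wait_for_state(read_state, target, timeout=60, interval=1.0):
    """Poll read_state() until target appears in it, or raise after timeout seconds."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if target in read_state():
            return True
        time.sleep(interval)
    raise TimeoutError('state %r not reached within %ss' % (target, timeout))


# hypothetical usage with the v2ray status label:
# wait_for_state(lambda: d(resourceId="com.v2ray.ang:id/tv_test_state").get_text(), 'Connected')
```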
diff --git a/006-TikTok/v2ray_pool/__init__.py b/006-TikTok/v2ray_pool/__init__.py
new file mode 100644
index 0000000..d0efa5d
--- /dev/null
+++ b/006-TikTok/v2ray_pool/__init__.py
@@ -0,0 +1,11 @@
+# Paths are resolved against the working directory at runtime, not against this __init__.py
+import os
+import sys
+
+BASE_DIR = "../002-V2rayPool"
+if os.path.exists(BASE_DIR):
+    sys.path.append(BASE_DIR)
+
+from core import utils
+from core.conf import Config
+from db.db_main import DBManage
\ No newline at end of file
diff --git a/README.md b/README.md
index fb2a8cf..c080adb 100644
--- a/README.md
+++ b/README.md
@@ -10,9 +10,12 @@
 - [003-Keywords:获取相关关键词以及其google趋势](./003-Keywords) ——已完成
-
 - [004-EmailNotify:监听虚拟币变化,使用邮箱通知](./004-EmailNotify) ——已完成
+- [005-PaidSource: scripts you are sure to find useful](./005-PaidSource) —— done
+
+- [006-TikTok: App automation](./006-TikTok) —— done (continuously updated)
+
 ----------
 ##### 声明:本项目仅用于学习交流,禁止任何商业用途,违者自承担相应法律责任!