diff --git a/src/application/main_complete.py b/src/application/main_complete.py index 1eaaeccd..358e01c2 100644 --- a/src/application/main_complete.py +++ b/src/application/main_complete.py @@ -1,1650 +1,1702 @@ -from datetime import date -# from datetime import datetime -from pathlib import Path -from platform import system -from time import time -from types import SimpleNamespace -from typing import Callable -from typing import TYPE_CHECKING -from typing import Union - -from ..custom import failure_handling -from ..custom import suspend -from ..downloader import Downloader -from ..extract import Extractor -from ..interface import ( - Account, - AccountTikTok, - Comment, - Detail, - Live, - Collection, - Mix, - Hot, - # Search, - User, - HashTag, - DetailTikTok, - CollectsMix, - LiveTikTok, - MixTikTok, - # CommentTikTok, - Collects, - # CollectsSeries, - CollectsMusic, - CollectsDetail, - Info, - InfoTikTok, -) -from ..link import Extractor as LinkExtractor -from ..link import ExtractorTikTok -from ..manager import Cache -from ..storage import RecordManager -from ..tools import TikTokDownloaderError -from ..tools import choose -from ..tools import safe_pop -from ..translation import _ - -if TYPE_CHECKING: - from ..config import Parameter - from ..manager import Database - -__all__ = [ - "TikTok", -] - - -def check_storage_format(function): - async def inner(self, *args, **kwargs): - if self.parameter.storage_format: - return await function(self, *args, **kwargs) - self.console.warning( - _("未设置 storage_format 参数,无法正常使用该功能,详细说明请查阅项目文档!"), - ) - - return inner - - -def check_cookie_state(tiktok=False): - def check_cookie(function): - async def inner(self, *args, **kwargs): - if tiktok: - params = self.parameter.cookie_tiktok_state - tip = "TikTok Cookie" - else: - params = self.parameter.cookie_state - tip = _("抖音 Cookie") - if params: - return await function(self, *args, **kwargs) - self.console.warning( - _("{tip} 未登录,无法使用该功能,详细说明请查阅项目文档!").format(tip=tip), - ) 
- - return inner - - return check_cookie - - -class TikTok: - ENCODE = "UTF-8-SIG" if system() == "Windows" else "UTF-8" - - def __init__(self, parameter: "Parameter", database: "Database", ): - self.run_command = None - self.parameter = parameter - self.database = database - self.console = parameter.console - self.logger = parameter.logger - self.links = LinkExtractor(parameter) - self.links_tiktok = ExtractorTikTok(parameter) - self.downloader = Downloader(parameter) - self.extractor = Extractor(parameter) - self.storage = bool(parameter.storage_format) - self.record = RecordManager() - self.settings = parameter.settings - self.accounts = parameter.accounts_urls - self.accounts_tiktok = parameter.accounts_urls_tiktok - self.mix = parameter.mix_urls - self.mix_tiktok = parameter.mix_urls_tiktok - self.owner = parameter.owner_url - self.owner_tiktok = parameter.owner_url_tiktok - self.running = True - self.ffmpeg = parameter.ffmpeg.state - self.cache = Cache( - parameter, - self.database, - "mark" in parameter.name_format, - "nickname" in parameter.name_format - ) - self.__function = ( - (_("批量下载账号作品(抖音)"), self.account_acquisition_interactive,), - (_("批量下载链接作品(抖音)"), self.detail_interactive,), - (_("获取直播推流地址(抖音)"), self.live_interactive,), - (_("采集作品评论数据(抖音)"), self.comment_interactive,), - (_("批量下载合集作品(抖音)"), self.mix_interactive,), - (_("采集账号详细数据(抖音)"), self.user_interactive,), - # (_("采集搜索结果数据(抖音)"),), - (_("采集抖音热榜数据(抖音)"), self.hot_interactive,), - # (_("批量下载话题作品(抖音)"),), - (_("批量下载收藏作品(抖音)"), self.collection_interactive,), - (_("批量下载收藏音乐(抖音)"), self.collection_music_interactive,), - # (_("批量下载收藏短剧(抖音)"),), - (_("批量下载收藏夹作品(抖音)"), self.collects_interactive,), - (_("批量下载账号作品(TikTok)"), self.account_acquisition_interactive_tiktok,), - (_("批量下载链接作品(TikTok)"), self.detail_interactive_tiktok,), - (_("批量下载合集作品(TikTok)"), self.mix_interactive_tiktok,), - (_("获取直播推流地址(TikTok)"), self.live_interactive_tiktok,), - # (_("采集作品评论数据(TikTok)"), 
self.comment_interactive_tiktok,), - ) - self.__function_account = ( - (_("使用 accounts_urls 参数的账号链接(推荐)"), self.account_detail_batch), - (_("手动输入待采集的账号链接"), self.account_detail_inquire), - (_("从文本文档读取待采集的账号链接"), self.account_detail_txt), - ) - self.__function_account_tiktok = ( - (_("使用 accounts_urls_tiktok 参数的账号链接(推荐)"), self.account_detail_batch_tiktok), - (_("手动输入待采集的账号链接"), self.account_detail_inquire_tiktok), - (_("从文本文档读取待采集的账号链接"), self.account_detail_txt_tiktok), - ) - self.__function_mix = ( - (_("使用 mix_urls 参数的合集链接(推荐)"), self.mix_batch), - (_("获取当前账号收藏合集列表"), self.mix_collection), - (_("手动输入待采集的合集/作品链接"), self.mix_inquire), - (_("从文本文档读取待采集的合集/作品链接"), self.mix_txt), - ) - self.__function_mix_tiktok = ( - (_("使用 mix_urls_tiktok 参数的合集链接(推荐)"), self.mix_batch_tiktok), - (_("手动输入待采集的合集/作品链接"), self.mix_inquire_tiktok), - (_("从文本文档读取待采集的合集/作品链接"), self.mix_txt_tiktok), - ) - self.__function_user = ( - (_("使用 accounts_urls 参数的账号链接"), self.user_batch), - (_("手动输入待采集的账号链接"), self.user_inquire), - (_("从文本文档读取待采集的账号链接"), self.user_txt), - ) - self.__function_detail = ( - (_("手动输入待采集的作品链接"), self.__detail_inquire), - (_("从文本文档读取待采集的作品链接"), self.__detail_txt), - ) - self.__function_detail_tiktok = ( - (_("手动输入待采集的作品链接"), self.__detail_inquire_tiktok), - (_("从文本文档读取待采集的作品链接"), self.__detail_txt_tiktok), - ) - self.__function_comment = ( - (_("手动输入待采集的作品链接"), self.__comment_inquire), - (_("从文本文档读取待采集的作品链接"), self.__comment_txt), - ) - self.__function_comment_tiktok = ( - (_("手动输入待采集的作品链接"), self.__comment_inquire_tiktok), - # (_("从文本文档读取待采集的作品链接"), self.__comment_txt_tiktok), - ) - - def _inquire_input(self, tip: str = "", problem: str = "", ) -> str: - text = self.console.input(problem or _("请输入{tip}链接: ").format(tip=tip)) - if not text: - return "" - elif text.upper() == "Q": - self.running = False - return "" - return text - - async def account_acquisition_interactive_tiktok( - self, - select="", - ): - await self.__secondary_menu( - _("请选择账号链接来源"), - 
function=self.__function_account_tiktok, - select=select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出批量下载账号作品(TikTok)模式")) - - def __summarize_results(self, count: SimpleNamespace, name=_("账号")): - time_ = time() - count.time - self.logger.info( - _("程序共处理 {0} 个{1},成功 {2} 个,失败 {3} 个,耗时 {4} 分钟 {5} 秒").format( - count.success + count.failed, - name, - count.success, - count.failed, - int(time_ // 60), - int(time_ % 60), - )) - - async def account_acquisition_interactive( - self, - select="", - ): - await self.__secondary_menu( - _("请选择账号链接来源"), - function=self.__function_account, - select=select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出批量下载账号作品(抖音)模式")) - - async def __secondary_menu( - self, - problem=_("请选择账号链接来源"), - function=..., - select: str | int = ..., - *args, - **kwargs, - ): - if not select: - select = choose( - problem, - [i[0] for i in function], - self.console, - ) - if select.upper() == "Q": - self.running = False - try: - n = int(select) - 1 - except ValueError: - return - if n in range(len(function)): - await function[n][1](*args, **kwargs, ) - - async def account_detail_batch(self, *args, ): - await self.__account_detail_batch( - self.accounts, - "accounts_urls", - False, - ) - - async def account_detail_batch_tiktok(self, *args, ): - await self.__account_detail_batch( - self.accounts_tiktok, - "accounts_urls_tiktok", - True, - ) - - async def __account_detail_batch( - self, - accounts: list[SimpleNamespace], - params_name: str, - tiktok: bool, - ) -> None: - count = SimpleNamespace(time=time(), success=0, failed=0) - self.logger.info(_("共有 {count} 个账号的作品等待下载").format(count=len(accounts))) - for index, data in enumerate(accounts, start=1): - if hasattr(data, "enable") and not data.enable: - continue - if not (sec_user_id := await self.check_sec_user_id( - data.url, - tiktok, - )): - self.logger.warning( - _("配置文件 {name} 参数第 {index} 条数据的 url {url} 错误,提取 sec_user_id 失败").format( - name=params_name, - index=index, - 
url=data.url, - )) - count.failed += 1 - continue - if not await self.deal_account_detail( - index, - **vars(data) | {"sec_user_id": sec_user_id}, - tiktok=tiktok, - ): - count.failed += 1 - continue - # break # 调试代码 - count.success += 1 - if index != len(accounts): - await suspend(index, self.console) - self.__summarize_results(count, _("账号"), ) - - async def check_sec_user_id(self, sec_user_id: str, tiktok=False, ) -> str: - match tiktok: - case True: - sec_user_id = await self.links_tiktok.run(sec_user_id, "user") - case False: - sec_user_id = await self.links.run(sec_user_id, "user") - return sec_user_id[0] if len(sec_user_id) > 0 else "" - - async def account_detail_inquire(self, *args, ): - while url := self._inquire_input(_("账号主页")): - links = await self.links.run(url, "user") - if not links: - self.logger.warning(_("{url} 提取账号 sec_user_id 失败").format(url=url)) - continue - await self.__account_detail_handle(links, False, *args, ) - - async def account_detail_inquire_tiktok(self, *args, ): - while url := self._inquire_input(_("账号主页")): - links = await self.links_tiktok.run(url, "user") - if not links: - self.logger.warning(_("{url} 提取账号 sec_user_id 失败").format(url=url)) - continue - await self.__account_detail_handle(links, True, *args, ) - - async def account_detail_txt(self, ): - await self._read_from_txt( - tiktok=False, - type_="user", - error=_("从文本文档提取账号 sec_user_id 失败"), - callback=self.__account_detail_handle, - ) - - async def _read_from_txt( - self, - tiktok=False, - type_: str = ..., - error: str = ..., - callback: Callable = ..., - *args, - **kwargs, - ): - if not (url := self.txt_inquire()): - return - link_obj = self.links_tiktok if tiktok else self.links - links = await link_obj.run(url, type_, ) - if not links or not isinstance(links[0], bool | None): - links = [links] - if not links[-1]: - self.logger.warning(error) - return - await callback(*links, *args, tiktok=tiktok, **kwargs, ) - - async def account_detail_txt_tiktok(self, ): - await 
self._read_from_txt( - tiktok=True, - type_="user", - error=_("从文本文档提取账号 sec_user_id 失败"), - callback=self.__account_detail_handle, - ) - - async def __account_detail_handle( - self, - links, - tiktok=False, - *args, - **kwargs, - ): - count = SimpleNamespace(time=time(), success=0, failed=0) - for index, sec in enumerate(links, start=1): - if not await self.deal_account_detail( - index, - sec_user_id=sec, - tiktok=tiktok, - *args, - **kwargs, - ): - count.failed += 1 - continue - count.success += 1 - if index != len(links): - await suspend(index, self.console) - self.__summarize_results(count, _("账号"), ) - - async def deal_account_detail( - self, - index: int, - sec_user_id: str, - mark="", - tab="post", - earliest="", - latest="", - pages: int = None, - api=False, - source=False, - cookie: str = None, - proxy: str = None, - tiktok=False, - *args, - **kwargs, - ): - self.logger.info(_("开始处理第 {index} 个账号").format(index=index) if index else _("开始处理账号")) - info = None - if tab in { - "favorite", - "collection", - }: - if not (info := await self.get_user_info_data( - tiktok, - cookie, - proxy, - sec_user_id=sec_user_id, - )): - self.logger.warning(_("{sec_user_id} 获取账号信息失败").format(sec_user_id=sec_user_id)) - return - acquirer = self._get_account_data_tiktok if tiktok else self._get_account_data - account_data, earliest, latest = await acquirer( - cookie=cookie, - proxy=proxy, - sec_user_id=sec_user_id, - tab=tab, - earliest=earliest, - latest=latest, - pages=pages, - ) - if not any(account_data): - return None - if source: - return self.extractor.source_date_filter( - account_data, - earliest, - latest, - tiktok, - ) - return await self._batch_process_detail( - account_data, - user_id=sec_user_id, - mark=mark, - api=api, - earliest=earliest, - latest=latest, - tiktok=tiktok, - mode=tab, - info=info, - ) - - async def _get_account_data( - self, - cookie: str = None, - proxy: str = None, - sec_user_id: Union[str] = ..., - tab: str = "post", - earliest: str = "", - 
latest: str = "", - pages: int = None, - *args, - **kwargs, - ): - return await Account( - self.parameter, - cookie, - proxy, - sec_user_id, - tab, - earliest, - latest, - pages, - ).run() - - async def _get_account_data_tiktok( - self, - cookie: str = None, - proxy: str = None, - sec_user_id: Union[str] = ..., - tab: str = "post", - earliest: str = "", - latest: str = "", - pages: int = None, - *args, - **kwargs, - ): - return await AccountTikTok( - self.parameter, - cookie, - proxy, - sec_user_id, - tab, - earliest, - latest, - pages, - ).run() - - async def get_user_info_data( - self, - tiktok=False, - cookie: str = None, - proxy: str = None, - unique_id: Union[str] = "", - sec_user_id: Union[str] = "", - ): - return ( - await self._get_info_data_tiktok( - cookie, - proxy, - unique_id, - sec_user_id, - ) - if tiktok - else await self._get_info_data( - cookie, - proxy, - sec_user_id, - ) - ) - - async def _get_info_data( - self, - cookie: str = None, - proxy: str = None, - sec_user_id: Union[str, list[str]] = ..., - ): - return await Info( - self.parameter, - cookie, - proxy, - sec_user_id, - ).run() - - async def _get_info_data_tiktok( - self, - cookie: str = None, - proxy: str = None, - unique_id: Union[str] = "", - sec_user_id: Union[str] = "", - ): - return await InfoTikTok( - self.parameter, - cookie, - proxy, - unique_id, - sec_user_id, - ).run() - - async def _batch_process_detail( - self, - data: list[dict], - api: bool = False, - earliest: date = None, - latest: date = None, - tiktok: bool = False, - info: dict = None, - mode: str = "", - mark: str = "", - user_id: str = "", - mix_id: str = "", - mix_title: str = "", - collect_id: str = "", - collect_name: str = "", - ): - self.logger.info(_("开始提取作品数据")) - id_, name, mark = self.extractor.preprocessing_data( - info or data, - tiktok, - mode, - mark, - user_id, - mix_id, - mix_title, - collect_id, - collect_name, - ) - self.__display_extracted_information(id_, name, mark, ) - prefix = 
self._generate_prefix(mode) - suffix = self._generate_suffix(mode) - old_mark = f"{m["MARK"]}_{suffix}" if ( - m := await self.cache.has_cache(id_) - ) else None - root, params, logger = self.record.run(self.parameter) - async with logger( - root, - name=f"{prefix}{id_}_{mark}_{suffix}", - old=old_mark, - console=self.console, - **params, - ) as recorder: - data = await self.extractor.run( - data, - recorder, - type_="batch", - tiktok=tiktok, - name=name, - mark=mark, - earliest=earliest or date(2016, 9, 20), - latest=latest or date.today(), - same=mode in { - "post", - "mix", - }, - ) - if api: - return data - await self.cache.update_cache( - self.parameter.folder_mode, - prefix, - suffix, - id_, - name, - mark, - ) - await self.download_detail_batch( - data, - tiktok=tiktok, - mode=mode, - mark=mark, - user_id=id_, - user_name=name, - mix_id=mix_id, - mix_title=mix_title, - collect_id=collect_id, - collect_name=collect_name, - ) - return True - - @staticmethod - def _generate_prefix(mode: str, ): - match mode: - case "post" | "favorite" | "collection": - return "UID" - case "mix": - return "MID" - case "collects": - return "CID" - case _: - raise TikTokDownloaderError - - @staticmethod - def _generate_suffix(mode: str, ): - match mode: - case "post": - return _("发布作品") - case "favorite": - return _("喜欢作品") - case "collection": - return _("收藏作品") - case "mix": - return _("合集作品") - case "collects": - return _("收藏夹作品") - case _: - raise TikTokDownloaderError - - def __display_extracted_information( - self, - id_: str, - name: str, - mark: str, - ) -> None: - self.logger.info(_("昵称/标题:{name};标识:{mark};ID:{id}").format( - name=name, - mark=mark, - id=id_, - ), ) - - async def download_detail_batch( - self, - data: list[dict], - type_: str = "batch", - tiktok: bool = False, - mode: str = "", - mark: str = "", - user_id: str = "", - user_name: str = "", - mix_id: str = "", - mix_title: str = "", - collect_id: str = "", - collect_name: str = "", - ): - await 
self.downloader.run( - data, - type_, - tiktok, - mode=mode, - mark=mark, - user_id=user_id, - user_name=user_name, - mix_id=mix_id, - mix_title=mix_title, - collect_id=collect_id, - collect_name=collect_name, - ) - - async def detail_interactive(self, select="", ): - await self.__secondary_menu( - _("请选择作品链接来源"), - self.__function_detail, - select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出批量下载链接作品(抖音)模式")) - - async def detail_interactive_tiktok(self, select="", ): - await self.__detail_secondary_menu( - self.__function_detail_tiktok, - select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出批量下载链接作品(TikTok)模式")) - - async def __detail_secondary_menu(self, menu, select="", *args, **kwargs): - root, params, logger = self.record.run(self.parameter) - async with logger(root, console=self.console, **params) as record: - if not select: - select = choose( - _("请选择作品链接来源"), - [i[0] for i in menu], - self.console, - ) - if select.upper() == "Q": - self.running = False - try: - n = int(select) - 1 - except ValueError: - return - if n in range(len(menu)): - await menu[n][1](record) - - async def __detail_inquire(self, tiktok=False, ): - root, params, logger = self.record.run(self.parameter) - link_obj = self.links_tiktok if tiktok else self.links - async with logger(root, console=self.console, **params) as record: - while url := self._inquire_input(_("作品")): - ids = await link_obj.run(url) - if not any(ids): - self.logger.warning(_("{url} 提取作品 ID 失败").format(url=url)) - continue - self.console.print(_("共提取到 {count} 个作品,开始处理!").format(count=len(ids))) - await self._handle_detail(ids, tiktok, record, ) - - async def __detail_inquire_tiktok(self, tiktok=True, ): - await self.__detail_inquire(tiktok, ) - - async def __detail_txt(self, tiktok=False, ): - root, params, logger = self.record.run(self.parameter) - async with logger(root, console=self.console, **params) as record: - await self._read_from_txt( - tiktok, - "detail", - _("从文本文档提取作品 ID 失败"), 
- self._handle_detail, - record=record, - ) - - async def __detail_txt_tiktok(self, tiktok=True, ): - await self.__detail_txt(tiktok=tiktok, ) - - async def __read_detail_txt(self): - if not (url := self.txt_inquire()): - return - ids = await self.links.run(url) - if not any(ids): - self.logger.warning(_("从文本文档提取作品 ID 失败")) - return - self.console.print(_("共提取到 {count} 个作品,开始处理!").format(count=len(ids))) - return ids - - async def _handle_detail( - self, - ids: list[str], - tiktok: bool, - record, - api=False, - source=False, - cookie: str = None, - proxy: str = None, - ): - obj = DetailTikTok if tiktok else Detail - return await self.__handle_detail( - tiktok, - obj, - ids, - record, - api=api, - source=source, - cookie=cookie, - proxy=proxy, - ) - - async def __handle_detail( - self, - tiktok: bool, - request_obj: Callable, - ids: list[str], - record, - api=False, - source=False, - cookie: str = None, - proxy: str = None, - ): - detail_data = [ - await request_obj( - self.parameter, - cookie, - proxy, - i, - ).run() for i in ids - ] - if not any(detail_data): - return None - if source: - return detail_data - detail_data = await self.extractor.run(detail_data, record, tiktok=tiktok, ) - if api: - return detail_data - await self.downloader.run(detail_data, "detail", tiktok=tiktok) - return self._get_preview_image(detail_data[0]) - - @staticmethod - def _get_preview_image(data: dict) -> str: - if data["type"] == _("图集"): - return data["downloads"][0] - elif data["type"] == _("视频"): - return data["origin_cover"] - return "" - - def _choice_live_quality( - self, - flv_items: dict, - m3u8_items: dict, - ) -> tuple | None: - if not self.ffmpeg: - self.logger.warning(_("程序未检测到有效的 ffmpeg,不支持直播下载功能!")) - return None - try: - choice_ = self.console.input( - _("请选择下载清晰度(输入清晰度或者对应序号,直接回车代表不下载): "), - ) - if u := flv_items.get(choice_): - return u, m3u8_items.get(choice_) - if not 0 <= (i := int(choice_) - 1) < len(flv_items): - raise ValueError - except ValueError: - return 
None - return list(flv_items.values())[i], list(m3u8_items.values())[i] - - async def live_interactive( - self, - cookie: str = None, - proxy: str = None, - *args, - ): - while url := self._inquire_input(_("直播")): - params = self._generate_live_params(*await self.links.run(url, type_="live")) - if not params: - self.logger.warning(_("{} 提取直播 ID 失败").format(url=url)) - continue - live_data = [await Live(self.parameter, cookie, proxy, **i).run() for i in params] - if not [i for i in live_data if i]: - self.logger.warning(_("获取直播数据失败")) - continue - live_data = await self.extractor.run(live_data, None, "live") - download_tasks = self.show_live_info(live_data) - await self.downloader.run(download_tasks, type_="live") - self.logger.info(_("已退出获取直播推流地址(抖音)模式")) - - async def live_interactive_tiktok( - self, - cookie: str = None, - proxy: str = None, - *args, - ): - while url := self._inquire_input(_("直播")): - _, ids = await self.links_tiktok.run(url, type_="live") - if not ids: - self.logger.warning(_("{} 提取直播 ID 失败").format(url=url)) - continue - live_data = [await LiveTikTok(self.parameter, cookie, proxy, i).run() for i in ids] - if not [i for i in live_data if i]: - self.logger.warning_(_("获取直播数据失败")) - continue - live_data = await self.extractor.run(live_data, None, "live", tiktok=True, ) - download_tasks = self.show_live_info_tiktok(live_data) - await self.downloader.run(download_tasks, type_="live", tiktok=True) - self.logger.info(_("已退出获取直播推流地址(TikTok)模式")) - - def _generate_live_params(self, rid: bool, ids: list[list]) -> list[dict]: - if not ids: - self.console.warning(_("提取 web_rid 或者 room_id 失败!"), ) - return [] - if rid: - return [{"web_rid": id_} for id_ in ids] - else: - return [{"room_id": id_[0], "sec_user_id": id_[1]} for id_ in ids] - - def show_live_info(self, data: list[dict]) -> list[tuple]: - download_tasks = [] - for item in data: - self.console.print(_("直播标题:"), item["title"]) - self.console.print(_("主播昵称:"), item["nickname"]) - 
self.console.print(_("在线观众:"), item["user_count_str"]) - self.console.print(_("观看次数:"), item["total_user_str"]) - if item["status"] == 4: - self.console.print(_("当前直播已结束!")) - continue - self.show_live_stream_url(item, download_tasks) - return [i for i in download_tasks if isinstance(i, tuple)] - - def show_live_info_tiktok(self, data: list[dict]) -> list[tuple]: - download_tasks = [] - for item in data: - if item["message"]: - self.console.print(item["message"]) - self.console.print(item["prompts"]) - continue - self.console.print(_("直播标题:"), item["title"]) - self.console.print(_("主播昵称:"), item["nickname"]) - self.console.print(_("开播时间:"), item["create_time"]) - self.console.print(_("在线观众:"), item["user_count"]) - self.console.print(_("点赞次数:"), item["like_count"]) - self.show_live_stream_url_tiktok(item, download_tasks) - self.console.print("TikTok 直播下载功能尚未完成!") # 完成后移除 - return [i for i in download_tasks if isinstance(i, tuple)] - - def show_live_stream_url(self, item: dict, tasks: list): - self.console.print(_("FLV 推流地址: ")) - for i, (k, v) in enumerate(item["flv_pull_url"].items(), start=1): - self.console.print(i, k, v) - self.console.print(_("M3U8 推流地址: ")) - for i, (k, v) in enumerate(item["hls_pull_url_map"].items(), start=1): - self.console.print(i, k, v) - if self.parameter.download: - tasks.append( - ( - item, - *u - ) if ( - u := self._choice_live_quality( - item["flv_pull_url"], - item["hls_pull_url_map"], - )) else u - ) - - def show_live_stream_url_tiktok(self, item: dict, tasks: list): - self.console.print(_("FLV 推流地址: ")) - for i, (k, v) in enumerate(item["flv_pull_url"].items(), start=1): - self.console.print(i, k, v) - # TODO: TikTok 直播下载功能异常,代理错误 - # if self.parameter.download: - # tasks.append( - # ( - # item, - # *u, - # ) if ( - # # TikTok 平台 暂无 m3u8 地址 - # u := self._choice_live_quality( - # item["flv_pull_url"], - # item["flv_pull_url"], - # ) - # ) else u - # ) - - @check_storage_format - async def comment_interactive_tiktok(self, 
select="", *args, **kwargs): - ... - self.logger.info(_("已退出采集作品评论数据(TikTok)模式")) - - @check_storage_format - async def comment_interactive(self, select="", ): - await self.__secondary_menu( - _("请选择作品链接来源"), - self.__function_comment, - select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出采集作品评论数据(抖音)模式)")) - - async def __comment_inquire( - self, - tiktok=False, - ): - link = self.links_tiktok if tiktok else self.links - while url := self._inquire_input(_("作品")): - ids = await link.run(url, ) - if not any(ids): - self.logger.warning(_("{url} 提取作品 ID 失败").format(url=url)) - continue - self.console.print(_("共提取到 {count} 个作品,开始处理!").format(count=len(ids))) - await self.__comment_handle( - ids, - tiktok=tiktok, - ) - - async def __comment_inquire_tiktok(self, ): - await self.__comment_inquire(True, ) - - async def __comment_txt(self, tiktok=False, ): - await self._read_from_txt( - tiktok, - "detail", - _("从文本文档提取作品 ID 失败"), - self.__comment_handle, - ) - - async def __comment_handle( - self, - ids: list, - tiktok=False, - cookie: str = None, - proxy: str = None, - ): - if tiktok: # TODO: 代码未完成 - ... 
- else: - for i in ids: - name = _("作品{id}_评论数据").format(id=i) - root, params, logger = self.record.run(self.parameter, type_="comment") - async with logger(root, name=name, console=self.console, **params) as record: - if d := await Comment( - self.parameter, - cookie, - proxy, - item_id=i, - reply=False, - ).run(): - await self.extractor.run(d, record, type_="comment") - self.logger.info(_("作品评论数据已储存至 {filename}").format(filename=name)) - else: - self.logger.warning(_("采集评论数据失败")) - - async def mix_interactive(self, select="", ): - await self.__secondary_menu( - _("请选择合集链接来源"), - self.__function_mix, - select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出批量下载合集作品(抖音)模式")) - - async def mix_interactive_tiktok(self, select="", ): - await self.__secondary_menu( - _("请选择合集链接来源"), - self.__function_mix_tiktok, - select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出批量下载合集作品(TikTok)模式")) - - @staticmethod - def _generate_mix_params(mix: bool, id_: str) -> dict: - return {"mix_id": id_, } if mix else {"detail_id": id_, } - - async def mix_inquire(self, ): - while url := self._inquire_input(_("合集或作品")): - mix_id, ids = await self.links.run(url, type_="mix") - if not ids: - self.logger.warning(_("{url} 获取作品 ID 或合集 ID 失败").format(url=url)) - continue - await self.__mix_handle(mix_id, ids, ) - - async def mix_inquire_tiktok(self, ): - while url := self._inquire_input(_("合集或作品")): - __, ids, title = await self.links_tiktok.run(url, type_="mix") - if not ids: - self.logger.warning(_("{url} 获取合集 ID 失败").format(url=url)) - continue - await self.__mix_handle(True, ids, title, True, ) - - @check_cookie_state(tiktok=False) - async def mix_collection(self, ): - if id_ := await self.mix_inquire_collection(): - await self.__mix_handle(True, id_, ) - - async def mix_inquire_collection(self) -> list[str]: - data = await CollectsMix(self.parameter).run() - if not any(data): - return [] - data = self.extractor.extract_mix_collect_info(data) - return 
self.input_download_index(data) - - def input_download_index(self, data: list[dict]) -> list[str] | None: - if d := self.__input_download_index(data, _("收藏合集"), ): - return [i["id"] for i in d] - - def __input_download_index( - self, - data: list[dict], - text=_("收藏合集"), - key="title", - ) -> list[dict] | None: - self.console.print(_("{text}列表:").format(text=_(text))) - for i, j in enumerate(data, start=1): - self.console.print(f"{i}. {j[key]}") - index = self.console.input( - _("请输入需要下载的{item}序号(多个序号使用空格分隔,输入 ALL 下载全部{item}):").format(item=text)) - try: - if not index: - pass - elif index.upper() == "ALL": - return data - elif index.upper() == "Q": - self.running = False - else: - index = {int(i) for i in index.split()} - return [j for i, j in enumerate(data, start=1) if i in index] - except ValueError: - self.console.warning(_("{text}序号输入错误!").format(text=text)) - - async def mix_txt(self, ): - await self._read_from_txt( - tiktok=False, - type_="mix", - error=_("从文本文档提取作品 ID 或合集 ID 失败"), - callback=self.__mix_handle, - ) - - async def mix_txt_tiktok(self, ): - await self._read_from_txt( - tiktok=True, - type_="mix", - error=_("从文本文档提取合集 ID 失败"), - callback=self.__mix_handle, - ) - - if not (url := self.txt_inquire()): - return - __, ids, title = await self.links_tiktok.run(url, type_="mix") - if not ids: - self.logger.warning() - return - await self.__mix_handle(True, ids, title, True, ) - - async def __mix_handle( - self, - mix_id: bool, - ids: list[str], - mix_title_map: list[str] = None, - tiktok=False, - ): - count = SimpleNamespace(time=time(), success=0, failed=0) - for index, i in enumerate(ids, start=1): - if not await self._deal_mix_detail( - mix_id, - i, - index=index, - tiktok=tiktok, - mix_title=mix_title_map[index - 1] if mix_title_map else None, - ): - count.failed += 1 - if index != len(ids) and failure_handling(): - continue - break - count.success += 1 - if index != len(ids): - await suspend(index, self.console) - self.__summarize_results(count, 
_("合集"), ) - - async def mix_batch(self, ): - await self.__mix_batch( - self.mix, - "mix_urls", - False, - ) - - async def mix_batch_tiktok(self, ): - await self.__mix_batch( - self.mix_tiktok, - "mix_urls_tiktok", - True, - ) - - async def __mix_batch( - self, - mix: list[SimpleNamespace], - params_name: str, - tiktok: bool, - ): - count = SimpleNamespace(time=time(), success=0, failed=0) - for index, data in enumerate(mix, start=1): - if hasattr(data, "enable") and not data.enable: - continue - mix_id, id_, title = await self._check_mix_id(data.url, tiktok, ) - if not id_: - self.logger.warning( - _("配置文件 {name} 参数第 {index} 条数据的 url {url} 错误,获取作品 ID 或合集 ID 失败").format( - name=params_name, - index=index, - url=data.url, - )) - count.failed += 1 - continue - if not await self._deal_mix_detail( - mix_id, - id_, - data.mark, - index, - tiktok=tiktok, - mix_title=title, - ): - count.failed += 1 - continue - count.success += 1 - if index != len(mix): - await suspend(index, self.console) - self.__summarize_results(count, _("合集"), ) - - async def _deal_mix_detail( - self, - mix_id: bool = None, - id_: str = None, - mark="", - index: int = 0, - api=False, - source=False, - cookie: str = None, - proxy: str = None, - tiktok=False, - mix_title: str = "", - ): - self.logger.info(_("开始处理第 {index} 个合集").format(index=index) if index else _("开始处理合集")) - mix_params = self._generate_mix_params(mix_id, id_) - if tiktok: - mix_obj = MixTikTok( - self.parameter, - cookie, - proxy, - mix_title=mix_title, - **mix_params, - ) - else: - mix_obj = Mix( - self.parameter, - cookie, - proxy, - **mix_params, - ) - if any( - mix_data := await mix_obj.run() - ): - return ( - mix_data - if source - else await self._batch_process_detail( - mix_data, - mode="mix", - mix_id=mix_obj.mix_id, - mark=mark, - api=api, - tiktok=tiktok, - ) - ) - self.logger.warning(_("采集合集作品数据失败")) - - async def _check_mix_id(self, url: str, tiktok: bool, ) -> tuple[bool, str, str]: - match tiktok: - case True: - _, ids, 
title = await self.links_tiktok.run(url, type_="mix") - return (True, ids[0], title[0]) if len(ids) > 0 else (None, "", "") - case False: - mix_id, ids = await self.links.run(url, type_="mix") - return (mix_id, ids[0], "") if len(ids) > 0 else (mix_id, "", "") - - async def user_batch(self, *args, **kwargs, ): - users = [] - for index, data in enumerate(self.accounts, start=1): - if not (sec_user_id := await self.check_sec_user_id(data.url)): - self.logger.warning( - _("配置文件 accounts_urls 参数第 {index} 条数据的 url 无效").format(index=index), - ) - continue - users.append(await self._get_user_data(sec_user_id)) - await self._deal_user_data([i for i in users if i]) - - async def user_inquire(self, *args, **kwargs, ): - while url := self._inquire_input(_("账号主页")): - sec_user_ids = await self.links.run(url, type_="user") - if not sec_user_ids: - self.logger.warning(_("{url} 提取账号 sec_user_id 失败").format(url=url)) - continue - users = [await self._get_user_data(i) for i in sec_user_ids] - await self._deal_user_data([i for i in users if i]) - - def txt_inquire(self) -> str: - if path := self.console.input(_("请输入文本文档路径:")): - if (t := Path(path.replace("\"", ""))).is_file(): - try: - with t.open("r", encoding=self.ENCODE) as f: - return f.read() - except UnicodeEncodeError as e: - self.logger.warning(_("{path} 文件读取异常: {error}").format(path=path, error=e)) - else: - self.console.print(_("{path} 文件不存在!").format(path=path)) - return "" - - async def user_txt(self, *args, **kwargs, ): - if not (url := self.txt_inquire()): - return - sec_user_ids = await self.links.run(url, type_="user") - if not sec_user_ids: - self.logger.warning(_("从文本文档提取账号 sec_user_id 失败")) - return - users = [await self._get_user_data(i) for i in sec_user_ids] - await self._deal_user_data([i for i in users if i]) - - async def _get_user_data( - self, - sec_user_id: str, - cookie: str = None, - proxy: str = None, - ): - self.logger.info(_("正在获取账号 {sec_user_id} 的数据").format(sec_user_id=sec_user_id)) - data = await 
User(self.parameter, cookie, proxy, sec_user_id, ).run() - return data or {} - - async def _deal_user_data( - self, - data: list[dict], - source=False, - ): - if not any(data): - return None - if source: - return data - root, params, logger = self.record.run(self.parameter, type_="user", ) - async with logger(root, name="UserData", console=self.console, **params) as recorder: - data = await self.extractor.run(data, recorder, type_="user") - self.logger.info(_("账号数据已保存至文件")) - return data - - @check_storage_format - async def user_interactive(self, select="", *args, **kwargs): - await self.__secondary_menu( - _("请选择账号链接来源"), - function=self.__function_user, - select=select or safe_pop(self.run_command), - ) - self.logger.info(_("已退出采集账号详细数据模式")) - - # def _enter_search_criteria( - # self, - # text: str = None, - # ) -> None | tuple | bool: - # if not text: - # text = self._inquire_input( - # problem="请输入搜索条件:\n(关键词 搜索类型 页数 排序规则 时间筛选)\n") - # # 分割字符串 - # text = text.split() - # # 如果列表长度小于指定长度,使用空字符串补齐 - # while 0 < len(text) < 5: - # text.append("0") - # return self._verify_search_criteria(*text) - # - # def _verify_search_criteria( - # self, - # keyword: str = None, - # type_: str = None, - # pages: str = None, - # sort: str = None, - # publish: str = None, - # *args, - # ) -> tuple | bool: - # if not keyword: - # return False - # if args: - # return True - # type_ = self.SEARCH["type"].get(type_, 0) - # type_text = self.SEARCH["type_text"][type_] - # pages = self._extract_integer(pages) - # sort = self.SEARCH["sort"].get(sort, 0) - # sort_text = self.SEARCH["sort_text"][sort] - # publish = int(publish) if publish in {"0", "1", "7", "182"} else 0 - # publish_text = self.SEARCH["publish_text"][publish] - # return keyword, (type_, type_text), pages, (sort, - # sort_text), (publish, publish_text) - # - # @staticmethod - # def _extract_integer(page: str) -> int: - # try: - # # 尝试将字符串转换为整数,如果转换成功,则返回比较大的数 - # return max(int(page), 1) - # except ValueError: - # # 
如果转换失败,则返回1 - # return 1 - # - # @check_storage_format - # async def search_interactive(self, *args, **kwargs): - # while True: - # if isinstance(c := self._enter_search_criteria(), tuple): - # await self._deal_search_data(*c) - # elif c: - # self.console.print("搜索条件输入格式错误,详细说明请查阅文档!", style=WARNING) - # continue - # else: - # break - # self.logger.info("已退出采集搜索结果数据模式") - # - # @staticmethod - # def _generate_search_name( - # keyword: str, - # type_: str, - # sort: str = None, - # publish: str = None) -> str: - # format_ = ( - # _("搜索数据"), - # f"{datetime.now():%Y_%m_%d_%H_%M_%S}", - # type_, - # keyword.strip(), - # sort, - # publish, - # ) - # if all(format_): - # return "_".join(format_) - # elif all(format_[:3]): - # return "_".join(format_[:3]) - # raise ValueError - - # async def _deal_search_data( - # self, - # keyword: str, - # type_: tuple, - # pages: int, - # sort: tuple, - # publish: tuple, - # source=False, - # cookie: str = None, - # proxy: str = None, - # ): - # search_data = await Search( - # self.parameter, - # cookie, - # proxy, - # keyword, - # type_[0], - # pages, - # sort[0], - # publish[0], - # ).run() - # if not any(search_data): - # # self.logger.warning("采集搜索数据失败") - # return None - # # print(search_data) # 调试代码 - # if source: - # return search_data - # name = self._generate_search_name( - # keyword, type_[1], sort[1], publish[1]) - # root, params, logger = self.record.run(self.parameter, - # type_=self.DATA_TYPE[type_[0]]) - # async with logger(root, name=name, console=self.console, **params) as logger: - # search_data = await self.extractor.run( - # search_data, - # logger, - # type_="search", - # tab=type_[0]) - # self.logger.info(f"搜索数据已保存至 {name}") - # # print(search_data) # 调试代码 - # return search_data - - @check_storage_format - async def hot_interactive(self, *args, ): - await self._deal_hot_data() - self.logger.info(_("已退出采集抖音热榜数据(抖音)模式")) - - async def _deal_hot_data( - self, - source=False, - cookie: str = None, - proxy: str = 
async def __check_owner_url(self, tiktok=False, ):
    """Resolve the configured owner URL to a ``sec_user_id``.

    Args:
        tiktok: Accepted for signature compatibility; not used by this
            implementation.

    Returns:
        The extracted ``sec_user_id`` when ``self.owner.url`` is valid.
        When it is not, the user is asked whether to go back: ``""`` is
        returned if the answer is ``NO`` (case-insensitive), ``None`` for
        any other answer.
    """
    if sec_user_id := await self.check_sec_user_id(self.owner.url):
        return sec_user_id
    self.logger.warning(
        _("配置文件 owner_url 的 url 参数 {url} 无效").format(url=self.owner.url),
    )
    answer = self.console.input(
        _("程序无法获取账号信息,建议修改配置文件后重新运行,是否返回上一级菜单(YES/NO)")
    )
    # str.upper has to be *called*: comparing the bound method itself with
    # "NO" is always unequal, which made the "" branch unreachable.
    if answer.upper() != "NO":
        return None
    return ""
def check_storage_format(function):
    """Decorate a coroutine method so it only runs when storage is configured.

    The wrapper awaits ``function`` when ``self.parameter.storage_format`` is
    truthy; otherwise it emits a console warning and returns ``None``.
    """
    from functools import wraps  # local import: keeps the block self-contained

    @wraps(function)  # keep the wrapped coroutine's __name__ / __doc__
    async def inner(self, *args, **kwargs):
        if self.parameter.storage_format:
            return await function(self, *args, **kwargs)
        self.console.warning(
            _("未设置 storage_format 参数,无法正常使用该功能,详细说明请查阅项目文档!"),
        )

    return inner


def check_cookie_state(tiktok=False):
    """Build a decorator that gates a coroutine method on the login state.

    Args:
        tiktok: When truthy the wrapper checks
            ``self.parameter.cookie_tiktok_state``; otherwise it checks
            ``self.parameter.cookie_state``.

    Returns:
        A decorator. The wrapped coroutine is awaited only when the selected
        state is truthy; otherwise a console warning is emitted and ``None``
        is returned.
    """
    from functools import wraps  # local import: keeps the block self-contained

    def check_cookie(function):
        @wraps(function)  # keep the wrapped coroutine's __name__ / __doc__
        async def inner(self, *args, **kwargs):
            if tiktok:
                logged_in = self.parameter.cookie_tiktok_state
            else:
                logged_in = self.parameter.cookie_state
            if logged_in:
                return await function(self, *args, **kwargs)
            # The label is only needed for the warning, so it is built lazily.
            tip = "TikTok Cookie" if tiktok else _("抖音 Cookie")
            self.console.warning(
                _("{tip} 未登录,无法使用该功能,详细说明请查阅项目文档!").format(tip=tip),
            )

        return inner

    return check_cookie
self.user_interactive,), + # (_("采集搜索结果数据(抖音)"),), + (_("采集抖音热榜数据(抖音)"), self.hot_interactive,), + # (_("批量下载话题作品(抖音)"),), + (_("批量下载收藏作品(抖音)"), self.collection_interactive,), + (_("批量下载收藏音乐(抖音)"), self.collection_music_interactive,), + # (_("批量下载收藏短剧(抖音)"),), + (_("批量下载收藏夹作品(抖音)"), self.collects_interactive,), + (_("批量下载账号作品(TikTok)"), self.account_acquisition_interactive_tiktok,), + (_("批量下载链接作品(TikTok)"), self.detail_interactive_tiktok,), + (_("获取直播推流地址(TikTok)"), self.live_interactive_tiktok,), + # (_("采集作品评论数据(TikTok)"), self.comment_interactive_tiktok,), + (_("批量下载合集作品(TikTok)"), self.mix_interactive_tiktok,), + (_("采集账号详细数据(TikTok)"), self.user_interactive_tiktok,), + ) + self.__function_account = ( + (_("使用 accounts_urls 参数的账号链接(推荐)"), self.account_detail_batch), + (_("手动输入待采集的账号链接"), self.account_detail_inquire), + (_("从文本文档读取待采集的账号链接"), self.account_detail_txt), + ) + self.__function_account_tiktok = ( + (_("使用 accounts_urls_tiktok 参数的账号链接(推荐)"), self.account_detail_batch_tiktok), + (_("手动输入待采集的账号链接"), self.account_detail_inquire_tiktok), + (_("从文本文档读取待采集的账号链接"), self.account_detail_txt_tiktok), + ) + self.__function_mix = ( + (_("使用 mix_urls 参数的合集链接(推荐)"), self.mix_batch), + (_("获取当前账号收藏合集列表"), self.mix_collection), + (_("手动输入待采集的合集/作品链接"), self.mix_inquire), + (_("从文本文档读取待采集的合集/作品链接"), self.mix_txt), + ) + self.__function_mix_tiktok = ( + (_("使用 mix_urls_tiktok 参数的合集链接(推荐)"), self.mix_batch_tiktok), + (_("手动输入待采集的合集/作品链接"), self.mix_inquire_tiktok), + (_("从文本文档读取待采集的合集/作品链接"), self.mix_txt_tiktok), + ) + self.__function_user = ( + (_("使用 accounts_urls 参数的账号链接"), self.user_batch), + (_("手动输入待采集的账号链接"), self.user_inquire), + (_("从文本文档读取待采集的账号链接"), self.user_txt), + ) + self.__function_user_tiktok = ( + (_("使用 accounts_urls_tiktok 参数的账号链接"), self.user_batch_tiktok), + (_("手动输入待采集的账号链接"), self.user_inquire_tiktok), + (_("从文本文档读取待采集的账号链接"), self.user_txt_tiktok), + ) + self.__function_detail = ( + (_("手动输入待采集的作品链接"), self.__detail_inquire), + 
async def __secondary_menu(
        self,
        problem=_("请选择账号链接来源"),
        function=...,
        select: str | int = ...,
        *args,
        **kwargs,
):
    """Show a sub-menu (unless a choice was pre-selected) and run the pick.

    Args:
        problem: Prompt text handed to ``choose``.
        function: Sequence of ``(label, coroutine_function)`` pairs.
        select: Pre-selected 1-based entry number, as ``str`` or ``int``.
            When omitted or falsy the user is prompted via ``choose``.
        *args, **kwargs: Forwarded to the selected coroutine function.

    Entering ``Q`` (any case) sets ``self.running`` to ``False``. Anything
    that is not a valid entry number returns without calling an entry.
    """
    if select is ... or not select:
        select = choose(
            problem,
            [i[0] for i in function],
            self.console,
        )
    # The annotation allows int, but .upper()/int() below need text.
    select = str(select)
    if select.upper() == "Q":
        self.running = False
        return
    try:
        n = int(select) - 1
    except ValueError:
        return
    if n in range(len(function)):
        await function[n][1](*args, **kwargs, )
async def check_sec_user_id(self, sec_user_id: str, tiktok=False, ) -> str:
    """Extract the first ``sec_user_id`` from a piece of text.

    Args:
        sec_user_id: Text handed to the link extractor (callers in this
            file pass account URLs).
        tiktok: Truthy selects ``self.links_tiktok``, falsy ``self.links``.
            Any truthy/falsy value works, not only ``True``/``False``.

    Returns:
        The first extracted ID, or ``""`` when the extractor finds nothing.
    """
    # A plain conditional instead of ``match``: literal bool patterns match
    # by identity, so a non-bool flag used to skip extraction entirely and
    # the first character of the input was returned.
    extractor = self.links_tiktok if tiktok else self.links
    found = await extractor.run(sec_user_id, "user")
    return found[0] if found else ""
links: + self.logger.warning(_("{url} 提取账号 sec_user_id 失败").format(url=url)) + continue + await self.__account_detail_handle(links, True, *args, ) + + async def account_detail_txt(self, ): + await self._read_from_txt( + tiktok=False, + type_="user", + error=_("从文本文档提取账号 sec_user_id 失败"), + callback=self.__account_detail_handle, + ) + + async def _read_from_txt( + self, + tiktok=False, + type_: str = ..., + error: str = ..., + callback: Callable = ..., + *args, + **kwargs, + ): + if not (url := self.txt_inquire()): + return + link_obj = self.links_tiktok if tiktok else self.links + links = await link_obj.run(url, type_, ) + if not links or not isinstance(links[0], bool | None): + links = [links] + if not links[-1]: + self.logger.warning(error) + return + await callback(*links, *args, tiktok=tiktok, **kwargs, ) + + async def account_detail_txt_tiktok(self, ): + await self._read_from_txt( + tiktok=True, + type_="user", + error=_("从文本文档提取账号 sec_user_id 失败"), + callback=self.__account_detail_handle, + ) + + async def __account_detail_handle( + self, + links, + tiktok=False, + *args, + **kwargs, + ): + count = SimpleNamespace(time=time(), success=0, failed=0) + for index, sec in enumerate(links, start=1): + if not await self.deal_account_detail( + index, + sec_user_id=sec, + tiktok=tiktok, + *args, + **kwargs, + ): + count.failed += 1 + continue + count.success += 1 + if index != len(links): + await suspend(index, self.console) + self.__summarize_results(count, _("账号"), ) + + async def deal_account_detail( + self, + index: int, + sec_user_id: str, + mark="", + tab="post", + earliest="", + latest="", + pages: int = None, + api=False, + source=False, + cookie: str = None, + proxy: str = None, + tiktok=False, + *args, + **kwargs, + ): + self.logger.info(_("开始处理第 {index} 个账号").format(index=index) if index else _("开始处理账号")) + info = None + if tab in { + "favorite", + "collection", + }: + if not (info := await self.get_user_info_data( + tiktok, + cookie, + proxy, + 
async def get_user_info_data(
        self,
        tiktok=False,
        cookie: str = None,
        proxy: str = None,
        unique_id: Union[str] = "",
        sec_user_id: Union[str] = "",
):
    """Fetch account info through the platform-specific helper.

    Args:
        tiktok: Truthy routes to ``_get_info_data_tiktok``, falsy to
            ``_get_info_data``.
        cookie: Forwarded unchanged to the helper.
        proxy: Forwarded unchanged to the helper.
        unique_id: Only forwarded on the TikTok path.
        sec_user_id: Forwarded on both paths.

    Returns:
        Whatever the selected helper returns.
    """
    if tiktok:
        return await self._get_info_data_tiktok(
            cookie,
            proxy,
            unique_id,
            sec_user_id,
        )
    return await self._get_info_data(
        cookie,
        proxy,
        sec_user_id,
    )
async def _batch_process_detail(
        self,
        data: list[dict],
        api: bool = False,
        earliest: date = None,
        latest: date = None,
        tiktok: bool = False,
        info: dict = None,
        mode: str = "",
        mark: str = "",
        user_id: str = "",
        mix_id: str = "",
        mix_title: str = "",
        collect_id: str = "",
        collect_name: str = "",
):
    """Extract, record and (unless ``api``) download a batch of works.

    Args:
        data: Raw work items to process.
        api: When truthy, return the extracted data and skip both the
            cache update and the download.
        earliest: Lower date bound for extraction; ``date(2016, 9, 20)``
            when falsy.
        latest: Upper date bound for extraction; ``date.today()`` when
            falsy.
        tiktok: Platform flag forwarded to the extractor and downloader.
        info: Account info used instead of ``data`` for preprocessing when
            truthy.
        mode: Batch kind; also selects the record-name prefix and suffix
            through ``_generate_prefix`` / ``_generate_suffix``.
        mark, user_id, mix_id, mix_title, collect_id, collect_name:
            Identification values forwarded to
            ``extractor.preprocessing_data`` and the downloader.

    Returns:
        The extracted data when ``api`` is truthy, otherwise ``True`` once
        the download call has finished.
    """
    self.logger.info(_("开始提取作品数据"))
    id_, name, mark = self.extractor.preprocessing_data(
        info or data,
        tiktok,
        mode,
        mark,
        user_id,
        mix_id,
        mix_title,
        collect_id,
        collect_name,
    )
    self.__display_extracted_information(id_, name, mark)
    prefix = self._generate_prefix(mode)
    suffix = self._generate_suffix(mode)
    # Pass the record name built from a previously cached mark as ``old=``.
    cached = await self.cache.has_cache(id_)
    old_mark = f"{cached['MARK']}_{suffix}" if cached else None
    lower_bound = earliest or date(2016, 9, 20)
    upper_bound = latest or date.today()
    same_owner = mode in {"post", "mix"}
    root, params, logger = self.record.run(self.parameter)
    async with logger(
            root,
            name=f"{prefix}{id_}_{mark}_{suffix}",
            old=old_mark,
            console=self.console,
            **params,
    ) as recorder:
        data = await self.extractor.run(
            data,
            recorder,
            type_="batch",
            tiktok=tiktok,
            name=name,
            mark=mark,
            earliest=lower_bound,
            latest=upper_bound,
            same=same_owner,
        )
    if api:
        return data
    await self.cache.update_cache(
        self.parameter.folder_mode,
        prefix,
        suffix,
        id_,
        name,
        mark,
    )
    await self.download_detail_batch(
        data,
        tiktok=tiktok,
        mode=mode,
        mark=mark,
        user_id=id_,
        user_name=name,
        mix_id=mix_id,
        mix_title=mix_title,
        collect_id=collect_id,
        collect_name=collect_name,
    )
    return True
"collects": + return "CID" + case _: + raise TikTokDownloaderError + + @staticmethod + def _generate_suffix(mode: str, ): + match mode: + case "post": + return _("发布作品") + case "favorite": + return _("喜欢作品") + case "collection": + return _("收藏作品") + case "mix": + return _("合集作品") + case "collects": + return _("收藏夹作品") + case _: + raise TikTokDownloaderError + + def __display_extracted_information( + self, + id_: str, + name: str, + mark: str, + ) -> None: + self.logger.info(_("昵称/标题:{name};标识:{mark};ID:{id}").format( + name=name, + mark=mark, + id=id_, + ), ) + + async def download_detail_batch( + self, + data: list[dict], + type_: str = "batch", + tiktok: bool = False, + mode: str = "", + mark: str = "", + user_id: str = "", + user_name: str = "", + mix_id: str = "", + mix_title: str = "", + collect_id: str = "", + collect_name: str = "", + ): + await self.downloader.run( + data, + type_, + tiktok, + mode=mode, + mark=mark, + user_id=user_id, + user_name=user_name, + mix_id=mix_id, + mix_title=mix_title, + collect_id=collect_id, + collect_name=collect_name, + ) + + async def detail_interactive(self, select="", ): + await self.__secondary_menu( + _("请选择作品链接来源"), + self.__function_detail, + select or safe_pop(self.run_command), + ) + self.logger.info(_("已退出批量下载链接作品(抖音)模式")) + + async def detail_interactive_tiktok(self, select="", ): + await self.__detail_secondary_menu( + self.__function_detail_tiktok, + select or safe_pop(self.run_command), + ) + self.logger.info(_("已退出批量下载链接作品(TikTok)模式")) + + async def __detail_secondary_menu(self, menu, select="", *args, **kwargs): + root, params, logger = self.record.run(self.parameter) + async with logger(root, console=self.console, **params) as record: + if not select: + select = choose( + _("请选择作品链接来源"), + [i[0] for i in menu], + self.console, + ) + if select.upper() == "Q": + self.running = False + try: + n = int(select) - 1 + except ValueError: + return + if n in range(len(menu)): + await menu[n][1](record) + + async def 
async def __read_detail_txt(self):
    """Read a text file and extract work IDs from it with ``self.links``.

    Returns:
        The extracted IDs, or ``None`` when no text was read or no ID was
        found (the latter is also logged as a warning).
    """
    text = self.txt_inquire()
    if not text:
        return
    ids = await self.links.run(text)
    if any(ids):
        self.console.print(_("共提取到 {count} 个作品,开始处理!").format(count=len(ids)))
        return ids
    self.logger.warning(_("从文本文档提取作品 ID 失败"))
    return
async def live_interactive(
        self,
        cookie: str = None,
        proxy: str = None,
        *args,
):
    """Repeatedly ask for a Douyin live link, show its streams, download.

    The loop ends when ``_inquire_input`` returns an empty string.

    Args:
        cookie: Forwarded to each ``Live`` request.
        proxy: Forwarded to each ``Live`` request.
        *args: Accepted for menu-dispatch compatibility; unused.
    """
    while url := self._inquire_input(_("直播")):
        params = self._generate_live_params(*await self.links.run(url, type_="live"))
        if not params:
            # The template uses an anonymous "{}" field, so the URL must be
            # supplied positionally (keyword-only formatting raised
            # IndexError); the keyword is kept so a template written with
            # "{url}" still formats.
            self.logger.warning(_("{} 提取直播 ID 失败").format(url, url=url))
            continue
        live_data = [await Live(self.parameter, cookie, proxy, **i).run() for i in params]
        if not any(live_data):
            self.logger.warning(_("获取直播数据失败"))
            continue
        live_data = await self.extractor.run(live_data, None, "live")
        download_tasks = self.show_live_info(live_data)
        await self.downloader.run(download_tasks, type_="live")
    self.logger.info(_("已退出获取直播推流地址(抖音)模式"))


async def live_interactive_tiktok(
        self,
        cookie: str = None,
        proxy: str = None,
        *args,
):
    """Repeatedly ask for a TikTok live link, show its streams, download.

    The loop ends when ``_inquire_input`` returns an empty string.

    Args:
        cookie: Forwarded to each ``LiveTikTok`` request.
        proxy: Forwarded to each ``LiveTikTok`` request.
        *args: Accepted for menu-dispatch compatibility; unused.
    """
    while url := self._inquire_input(_("直播")):
        # Bind the unused first element to "__": assigning to "_" would make
        # the translation function a local of this coroutine and every
        # "_(...)" call above and below would raise UnboundLocalError.
        __, ids = await self.links_tiktok.run(url, type_="live")
        if not ids:
            # Positional argument for the anonymous "{}" field (see
            # live_interactive); keyword kept for "{url}"-style templates.
            self.logger.warning(_("{} 提取直播 ID 失败").format(url, url=url))
            continue
        live_data = [await LiveTikTok(self.parameter, cookie, proxy, i).run() for i in ids]
        if not any(live_data):
            self.logger.warning(_("获取直播数据失败"))
            continue
        live_data = await self.extractor.run(live_data, None, "live", tiktok=True, )
        download_tasks = self.show_live_info_tiktok(live_data)
        await self.downloader.run(download_tasks, type_="live", tiktok=True)
    self.logger.info(_("已退出获取直播推流地址(TikTok)模式"))
async def __comment_inquire(
        self,
        tiktok=False,
):
    """Ask for work links in a loop and collect the comments of each batch.

    The loop ends when ``_inquire_input`` returns an empty string.

    Args:
        tiktok: Truthy selects ``self.links_tiktok`` for ID extraction,
            otherwise ``self.links``; also forwarded to the comment handler.
    """
    extractor = self.links_tiktok if tiktok else self.links
    while True:
        url = self._inquire_input(_("作品"))
        if not url:
            break
        ids = await extractor.run(url)
        if any(ids):
            self.console.print(_("共提取到 {count} 个作品,开始处理!").format(count=len(ids)))
            await self.__comment_handle(
                ids,
                tiktok=tiktok,
            )
        else:
            self.logger.warning(_("{url} 提取作品 ID 失败").format(url=url))
+ else: + for i in ids: + name = _("作品{id}_评论数据").format(id=i) + root, params, logger = self.record.run(self.parameter, type_="comment") + async with logger(root, name=name, console=self.console, **params) as record: + if d := await Comment( + self.parameter, + cookie, + proxy, + item_id=i, + reply=False, + ).run(): + await self.extractor.run(d, record, type_="comment") + self.logger.info(_("作品评论数据已储存至 {filename}").format(filename=name)) + else: + self.logger.warning(_("采集评论数据失败")) + + async def mix_interactive(self, select="", ): + await self.__secondary_menu( + _("请选择合集链接来源"), + self.__function_mix, + select or safe_pop(self.run_command), + ) + self.logger.info(_("已退出批量下载合集作品(抖音)模式")) + + async def mix_interactive_tiktok(self, select="", ): + await self.__secondary_menu( + _("请选择合集链接来源"), + self.__function_mix_tiktok, + select or safe_pop(self.run_command), + ) + self.logger.info(_("已退出批量下载合集作品(TikTok)模式")) + + @staticmethod + def _generate_mix_params(mix: bool, id_: str) -> dict: + return {"mix_id": id_, } if mix else {"detail_id": id_, } + + async def mix_inquire(self, ): + while url := self._inquire_input(_("合集或作品")): + mix_id, ids = await self.links.run(url, type_="mix") + if not ids: + self.logger.warning(_("{url} 获取作品 ID 或合集 ID 失败").format(url=url)) + continue + await self.__mix_handle(mix_id, ids, ) + + async def mix_inquire_tiktok(self, ): + while url := self._inquire_input(_("合集或作品")): + __, ids, title = await self.links_tiktok.run(url, type_="mix") + if not ids: + self.logger.warning(_("{url} 获取合集 ID 失败").format(url=url)) + continue + await self.__mix_handle(True, ids, title, True, ) + + @check_cookie_state(tiktok=False) + async def mix_collection(self, ): + if id_ := await self.mix_inquire_collection(): + await self.__mix_handle(True, id_, ) + + async def mix_inquire_collection(self) -> list[str]: + data = await CollectsMix(self.parameter).run() + if not any(data): + return [] + data = self.extractor.extract_mix_collect_info(data) + return 
self.input_download_index(data) + + def input_download_index(self, data: list[dict]) -> list[str] | None: + if d := self.__input_download_index(data, _("收藏合集"), ): + return [i["id"] for i in d] + + def __input_download_index( + self, + data: list[dict], + text=_("收藏合集"), + key="title", + ) -> list[dict] | None: + self.console.print(_("{text}列表:").format(text=_(text))) + for i, j in enumerate(data, start=1): + self.console.print(f"{i}. {j[key]}") + index = self.console.input( + _("请输入需要下载的{item}序号(多个序号使用空格分隔,输入 ALL 下载全部{item}):").format(item=text)) + try: + if not index: + pass + elif index.upper() == "ALL": + return data + elif index.upper() == "Q": + self.running = False + else: + index = {int(i) for i in index.split()} + return [j for i, j in enumerate(data, start=1) if i in index] + except ValueError: + self.console.warning(_("{text}序号输入错误!").format(text=text)) + + async def mix_txt(self, ): + await self._read_from_txt( + tiktok=False, + type_="mix", + error=_("从文本文档提取作品 ID 或合集 ID 失败"), + callback=self.__mix_handle, + ) + + async def mix_txt_tiktok(self, ): + await self._read_from_txt( + tiktok=True, + type_="mix", + error=_("从文本文档提取合集 ID 失败"), + callback=self.__mix_handle, + ) + + if not (url := self.txt_inquire()): + return + __, ids, title = await self.links_tiktok.run(url, type_="mix") + if not ids: + self.logger.warning() + return + await self.__mix_handle(True, ids, title, True, ) + + async def __mix_handle( + self, + mix_id: bool, + ids: list[str], + mix_title_map: list[str] = None, + tiktok=False, + ): + count = SimpleNamespace(time=time(), success=0, failed=0) + for index, i in enumerate(ids, start=1): + if not await self._deal_mix_detail( + mix_id, + i, + index=index, + tiktok=tiktok, + mix_title=mix_title_map[index - 1] if mix_title_map else None, + ): + count.failed += 1 + if index != len(ids) and failure_handling(): + continue + break + count.success += 1 + if index != len(ids): + await suspend(index, self.console) + self.__summarize_results(count, 
_("合集"), ) + + async def mix_batch(self, ): + await self.__mix_batch( + self.mix, + "mix_urls", + False, + ) + + async def mix_batch_tiktok(self, ): + await self.__mix_batch( + self.mix_tiktok, + "mix_urls_tiktok", + True, + ) + + async def __mix_batch( + self, + mix: list[SimpleNamespace], + params_name: str, + tiktok: bool, + ): + count = SimpleNamespace(time=time(), success=0, failed=0) + for index, data in enumerate(mix, start=1): + if hasattr(data, "enable") and not data.enable: + continue + mix_id, id_, title = await self._check_mix_id(data.url, tiktok, ) + if not id_: + self.logger.warning( + _("配置文件 {name} 参数第 {index} 条数据的 url {url} 错误,获取作品 ID 或合集 ID 失败").format( + name=params_name, + index=index, + url=data.url, + )) + count.failed += 1 + continue + if not await self._deal_mix_detail( + mix_id, + id_, + data.mark, + index, + tiktok=tiktok, + mix_title=title, + ): + count.failed += 1 + continue + count.success += 1 + if index != len(mix): + await suspend(index, self.console) + self.__summarize_results(count, _("合集"), ) + + async def _deal_mix_detail( + self, + mix_id: bool = None, + id_: str = None, + mark="", + index: int = 0, + api=False, + source=False, + cookie: str = None, + proxy: str = None, + tiktok=False, + mix_title: str = "", + ): + self.logger.info(_("开始处理第 {index} 个合集").format(index=index) if index else _("开始处理合集")) + mix_params = self._generate_mix_params(mix_id, id_) + if tiktok: + mix_obj = MixTikTok( + self.parameter, + cookie, + proxy, + mix_title=mix_title, + **mix_params, + ) + else: + mix_obj = Mix( + self.parameter, + cookie, + proxy, + **mix_params, + ) + if any( + mix_data := await mix_obj.run() + ): + return ( + mix_data + if source + else await self._batch_process_detail( + mix_data, + mode="mix", + mix_id=mix_obj.mix_id, + mark=mark, + api=api, + tiktok=tiktok, + ) + ) + self.logger.warning(_("采集合集作品数据失败")) + + async def _check_mix_id(self, url: str, tiktok: bool, ) -> tuple[bool, str, str]: + match tiktok: + case True: + _, ids, 
title = await self.links_tiktok.run(url, type_="mix") + return (True, ids[0], title[0]) if len(ids) > 0 else (None, "", "") + case False: + mix_id, ids = await self.links.run(url, type_="mix") + return (mix_id, ids[0], "") if len(ids) > 0 else (mix_id, "", "") + + async def user_batch(self, *args, **kwargs, ): + users = [] + for index, data in enumerate(self.accounts, start=1): + if not (sec_user_id := await self.check_sec_user_id(data.url)): + self.logger.warning( + _("配置文件 accounts_urls 参数第 {index} 条数据的 url 无效").format(index=index), + ) + continue + users.append(await self._get_user_data(sec_user_id=sec_user_id)) + await self._deal_user_data([i for i in users if i]) + async def get_uniqueId(self, url): + # 使用正则表达式匹配用户名 + match = re.search(r'https://www\.tiktok\.com/@([^/]+)', url) + if match: + return match.group(1) + else: + raise ValueError("Invalid TikTok URL format") + async def user_batch_tiktok(self, *args, **kwargs, ): + users = [] + for index, data in enumerate(self.accounts_tiktok, start=1): + if not (uniqueId := await self.get_uniqueId(data.url)): + self.logger.warning( + _("配置文件 accounts_urls_tiktok 参数第 {index} 条数据的 url 无效").format(index=index), + ) + continue + users.append(await self._get_user_data(uniqueId=uniqueId, tiktok=True)) + await self._deal_user_data([i for i in users if i], type="user_tiktok", tiktok=True) + + async def user_inquire(self, *args, **kwargs, ): + while url := self._inquire_input(_("账号主页")): + sec_user_ids = await self.links.run(url, type_="user") + if not sec_user_ids: + self.logger.warning(_("{url} 提取账号 sec_user_id 失败").format(url=url)) + continue + users = [await self._get_user_data(i) for i in sec_user_ids] + await self._deal_user_data([i for i in users if i]) + + async def user_inquire_tiktok(self, *args, **kwargs, ): + pass + + def txt_inquire(self) -> str: + if path := self.console.input(_("请输入文本文档路径:")): + if (t := Path(path.replace("\"", ""))).is_file(): + try: + with t.open("r", encoding=self.ENCODE) as f: + return 
f.read() + except UnicodeEncodeError as e: + self.logger.warning(_("{path} 文件读取异常: {error}").format(path=path, error=e)) + else: + self.console.print(_("{path} 文件不存在!").format(path=path)) + return "" + + async def user_txt(self, *args, **kwargs, ): + if not (url := self.txt_inquire()): + return + sec_user_ids = await self.links.run(url, type_="user") + if not sec_user_ids: + self.logger.warning(_("从文本文档提取账号 sec_user_id 失败")) + return + users = [await self._get_user_data(i) for i in sec_user_ids] + await self._deal_user_data([i for i in users if i]) + + async def user_txt_tiktok(self, *args, **kwargs, ): + pass + + async def _get_user_data( + self, + sec_user_id: str = None, + uniqueId: str = None, + cookie: str = None, + proxy: str = None, + tiktok: bool = False, + + ): + account_id = sec_user_id if sec_user_id else uniqueId + self.logger.info(_("正在获取账号 {account_id} 的数据").format(account_id=account_id)) + if tiktok: + data = await UserTikTok(self.parameter, cookie, proxy, sec_user_id, "userInfo", uniqueId).run() + else: + data = await User(self.parameter, cookie, proxy, sec_user_id, "user" ).run() + return data or {} + + async def _deal_user_data( + self, + data: list[dict], + source=False, + type: str = "user", + tiktok: bool = False, + ): + if not any(data): + return None + if source: + return data + root, params, logger = self.record.run(self.parameter, type_=type, ) + # TODO + if type == "user": + data_name = "UserData" + else: + data_name = "UserTikTokData" + async with logger(root, name=data_name, console=self.console, **params) as recorder: + data = await self.extractor.run(data, recorder, type_="user", tiktok=tiktok) + self.logger.info(_("账号数据已保存至文件")) + return data + + @check_storage_format + async def user_interactive(self, select="", *args, **kwargs): + await self.__secondary_menu( + _("请选择账号链接来源"), + function=self.__function_user, + select=select or safe_pop(self.run_command), + ) + self.logger.info(_("已退出采集账号详细数据模式")) + async def 
user_interactive_tiktok(self, select="", *args, **kwargs): + await self.__secondary_menu( + _("请选择账号链接来源"), + function=self.__function_user_tiktok, + select=select or safe_pop(self.run_command), + ) + self.logger.info(_("已退出采集账号详细数据模式")) + + # def _enter_search_criteria( + # self, + # text: str = None, + # ) -> None | tuple | bool: + # if not text: + # text = self._inquire_input( + # problem="请输入搜索条件:\n(关键词 搜索类型 页数 排序规则 时间筛选)\n") + # # 分割字符串 + # text = text.split() + # # 如果列表长度小于指定长度,使用空字符串补齐 + # while 0 < len(text) < 5: + # text.append("0") + # return self._verify_search_criteria(*text) + # + # def _verify_search_criteria( + # self, + # keyword: str = None, + # type_: str = None, + # pages: str = None, + # sort: str = None, + # publish: str = None, + # *args, + # ) -> tuple | bool: + # if not keyword: + # return False + # if args: + # return True + # type_ = self.SEARCH["type"].get(type_, 0) + # type_text = self.SEARCH["type_text"][type_] + # pages = self._extract_integer(pages) + # sort = self.SEARCH["sort"].get(sort, 0) + # sort_text = self.SEARCH["sort_text"][sort] + # publish = int(publish) if publish in {"0", "1", "7", "182"} else 0 + # publish_text = self.SEARCH["publish_text"][publish] + # return keyword, (type_, type_text), pages, (sort, + # sort_text), (publish, publish_text) + # + # @staticmethod + # def _extract_integer(page: str) -> int: + # try: + # # 尝试将字符串转换为整数,如果转换成功,则返回比较大的数 + # return max(int(page), 1) + # except ValueError: + # # 如果转换失败,则返回1 + # return 1 + # + # @check_storage_format + # async def search_interactive(self, *args, **kwargs): + # while True: + # if isinstance(c := self._enter_search_criteria(), tuple): + # await self._deal_search_data(*c) + # elif c: + # self.console.print("搜索条件输入格式错误,详细说明请查阅文档!", style=WARNING) + # continue + # else: + # break + # self.logger.info("已退出采集搜索结果数据模式") + # + # @staticmethod + # def _generate_search_name( + # keyword: str, + # type_: str, + # sort: str = None, + # publish: str = None) -> str: + # 
format_ = ( + # _("搜索数据"), + # f"{datetime.now():%Y_%m_%d_%H_%M_%S}", + # type_, + # keyword.strip(), + # sort, + # publish, + # ) + # if all(format_): + # return "_".join(format_) + # elif all(format_[:3]): + # return "_".join(format_[:3]) + # raise ValueError + + # async def _deal_search_data( + # self, + # keyword: str, + # type_: tuple, + # pages: int, + # sort: tuple, + # publish: tuple, + # source=False, + # cookie: str = None, + # proxy: str = None, + # ): + # search_data = await Search( + # self.parameter, + # cookie, + # proxy, + # keyword, + # type_[0], + # pages, + # sort[0], + # publish[0], + # ).run() + # if not any(search_data): + # # self.logger.warning("采集搜索数据失败") + # return None + # # print(search_data) # 调试代码 + # if source: + # return search_data + # name = self._generate_search_name( + # keyword, type_[1], sort[1], publish[1]) + # root, params, logger = self.record.run(self.parameter, + # type_=self.DATA_TYPE[type_[0]]) + # async with logger(root, name=name, console=self.console, **params) as logger: + # search_data = await self.extractor.run( + # search_data, + # logger, + # type_="search", + # tab=type_[0]) + # self.logger.info(f"搜索数据已保存至 {name}") + # # print(search_data) # 调试代码 + # return search_data + + @check_storage_format + async def hot_interactive(self, *args, ): + await self._deal_hot_data() + self.logger.info(_("已退出采集抖音热榜数据(抖音)模式")) + + async def _deal_hot_data( + self, + source=False, + cookie: str = None, + proxy: str = None, + ): + time_, board = await Hot(self.parameter, cookie, proxy, ).run() + if not any(board): + return None, None + if source: + return time_, [{Hot.board_params[i].name: j} for i, j in board] + root, params, logger = self.record.run(self.parameter, type_="hot") + data = [] + for i, j in board: + name = _("热榜数据_{time}_{name}").format(time=time_, name=Hot.board_params[i].name) + async with logger(root, name=name, console=self.console, **params) as record: + data.append( + {Hot.board_params[i].name: await 
self.extractor.run(j, record, type_="hot")}) + self.logger.info(_("热榜数据已储存至: 热榜数据_{time} + 榜单类型").format(time=time_)) + # print(time_, data, source) # 调试代码 + return time_, data + + @check_cookie_state(tiktok=False) + async def collection_interactive(self, *args, ): + if isinstance(sec_user_id := await self.__check_owner_url(), str): + start = time() + await self._deal_collection_data( + sec_user_id, + ) + self._time_statistics(start) + self.logger.info(_("已退出批量下载收藏作品(抖音)模式")) + + @check_cookie_state(tiktok=False) + async def collects_interactive(self, *args, key: str = "name", ): + if c := await self.__get_collects_list(key=key, ): + start = time() + for i in c: + await self._deal_collects_data( + i[key], + i["id"], + ) + self._time_statistics(start) + else: + self.logger.info(_("已退出批量下载收藏夹作品(抖音)模式")) + + async def __get_collects_list( + self, + cookie: str = None, + proxy: str | dict = None, + # api=False, + source=False, + key: str = "name", + *args, + **kwargs, + ): + collects = await Collects(self.parameter, cookie, proxy, ).run() + if not any(collects): + return None + if source: + return collects + data = self.extractor.extract_collects_info(collects) + return self.__input_download_index(data, _("收藏夹"), key, ) + + async def __check_owner_url(self, tiktok=False, ): + if not (sec_user_id := await self.check_sec_user_id(self.owner.url)): + self.logger.warning( + _("配置文件 owner_url 的 url 参数 {url} 无效").format(url=self.owner.url), + ) + if self.console.input( + _("程序无法获取账号信息,建议修改配置文件后重新运行,是否返回上一级菜单(YES/NO)") + ).upper != "NO": + return None + return "" + return sec_user_id + + @check_cookie_state(tiktok=False) + async def collection_music_interactive(self, *args, ): + start = time() + if data := await self.__handle_collection_music(*args, ): + data = await self.extractor.run(data, None, "music", ) + await self.downloader.run(data, type_="music", ) + self._time_statistics(start) + self.logger.info(_("已退出批量下载收藏音乐(抖音)模式")) + + def _time_statistics(self, start: float, 
): + time_ = time() - start + self.logger.info( + _("程序运行耗时 {minutes} 分钟 {seconds} 秒").format(minutes=int(time_ // 60), seconds=int(time_ % 60))) + + async def __handle_collection_music( + self, + cookie: str = None, + proxy: str = None, + *args, + **kwargs, + ): + data = await CollectsMusic( + self.parameter, + cookie, + proxy, + *args, + **kwargs, + ).run() + return data if any(data) else None + + async def _deal_collection_data( + self, + sec_user_id: str, + api=False, + source=False, + cookie: str = None, + proxy: str = None, + tiktok=False, + ): + self.logger.info(_("开始获取收藏数据")) + if not (info := await self.get_user_info_data( + tiktok, + cookie, + proxy, + sec_user_id=sec_user_id, + )): + self.logger.warning(_("{sec_user_id} 获取账号信息失败").format(sec_user_id=sec_user_id)) + return + collection = await Collection( + self.parameter, + cookie, + proxy, + sec_user_id, + ).run() + if not any(collection): + return None + if source: + return collection + return await self._batch_process_detail( + collection, + api, + tiktok=tiktok, + mode="collection", + mark=self.owner.mark, + user_id=sec_user_id, + info=info, + ) + + async def _deal_collects_data( + self, + name: str, + id_: str, + api=False, + source=False, + cookie: str = None, + proxy: str = None, + tiktok=False, + ): + self.logger.info(_("开始获取收藏夹数据")) + data = await CollectsDetail( + self.parameter, + cookie, + proxy, + id_, + ).run() + if not any(data): + return None + if source: + return data + return await self._batch_process_detail( + data, + mode="collects", + collect_id=id_, + collect_name=name, + api=api, + tiktok=tiktok, + ) + + async def hashtag_interactive( + self, + cookie: str = None, + proxy: str = None, + *args, + **kwargs, + ): + await HashTag(self.parameter, cookie, proxy, ).run() + + async def run(self, run_command: list): + self.run_command = run_command + while self.running: + if not (select := safe_pop(self.run_command)): + select = choose( + _("请选择采集功能"), + [i for i, __ in self.__function], + 
self.console, + (10,), + ) + if select in {"Q", "q", }: + self.running = False + try: + n = int(select) - 1 + except ValueError: + break + if n in range(len(self.__function)): + await self.__function[n][1](safe_pop(self.run_command)) diff --git a/src/extract/extractor.py b/src/extract/extractor.py index 3e4b0dbb..c2a0d698 100644 --- a/src/extract/extractor.py +++ b/src/extract/extractor.py @@ -1,1234 +1,1276 @@ -from datetime import datetime -from json import dumps -from time import localtime -from time import strftime -from types import SimpleNamespace -from typing import TYPE_CHECKING -from urllib.parse import urlparse - -from ..custom import ( - VIDEO_INDEX, - IMAGE_INDEX, - IMAGE_TIKTOK_INDEX, - DYNAMIC_COVER_INDEX, - ORIGIN_COVER_INDEX, - MUSIC_INDEX, - COMMENT_IMAGE_INDEX, - COMMENT_STICKER_INDEX, - LIVE_COVER_INDEX, - AUTHOR_COVER_INDEX, - HOT_WORD_COVER_INDEX, - COMMENT_IMAGE_LIST_INDEX, - BITRATE_INFO_TIKTOK_INDEX, - LIVE_DATA_INDEX, - AVATAR_LARGER_INDEX, - AUTHOR_COVER_URL_INDEX, - SEARCH_USER_INDEX, - SEARCH_AVATAR_INDEX, - MUSIC_COLLECTION_COVER_INDEX, - MUSIC_COLLECTION_DOWNLOAD_INDEX, -) -from ..custom import condition_filter -from ..tools import TikTokDownloaderError -from ..translation import _ - -if TYPE_CHECKING: - from ..config import Parameter - from datetime import date - -__all__ = ["Extractor"] - - -class Extractor: - statistics_keys = ( - "digg_count", - "comment_count", - "collect_count", - "share_count", - "play_count", - ) - statistics_keys_tiktok = ( - "diggCount", - "commentCount", - "collectCount", - "shareCount", - "playCount", - ) - detail_necessary_keys = "id" - comment_necessary_keys = "cid" - user_necessary_keys = "sec_uid" - extract_params_tiktok = { - "sec_uid": "author.secUid", - "mix_id": "playlistId", - "uid": "author.id", - "nickname": "author.nickname", - "mix_title": "playlistId", # TikTok 不返回合辑标题 - } - extract_params = { - "sec_uid": "author.sec_uid", - "mix_id": "mix_info.mix_id", - "uid": "author.uid", - "nickname": 
"author.nickname", - "mix_title": "mix_info.mix_name", - } - - def __init__(self, params: "Parameter"): - self.log = params.logger - self.date_format = params.date_format - self.cleaner = params.CLEANER - self.type = { - "batch": self.__batch, - "detail": self.__detail, - "comment": self.__comment, - "live": self.__live, - "user": self.__user, - "search": self.__search, - "hot": self.__hot, - "music": self.__music, - } - - def get_user_info(self, data: dict) -> dict: - try: - return { - "nickname": data["nickname"], - "sec_uid": data["sec_uid"], - "uid": data["uid"], - } - except (KeyError, TypeError): - self.log.error(_("提取账号信息失败: {data}").format(data=data)) - return {} - - def get_user_info_tiktok(self, data: dict) -> dict: - try: - return { - "nickname": data["user"]["nickname"], - "sec_uid": data["user"]["secUid"], - "uid": data["user"]["id"], - } - except (KeyError, TypeError): - self.log.error(_("提取账号信息失败: {data}").format(data=data)) - return {} - - @staticmethod - def generate_data_object( - data: dict | list, - ) -> SimpleNamespace | list[SimpleNamespace]: - def depth_conversion(element): - if isinstance(element, dict): - return SimpleNamespace( - **{k: depth_conversion(v) for k, v in element.items()}) - elif isinstance(element, list): - return [depth_conversion(item) for item in element] - else: - return element - - return depth_conversion(data) - - @staticmethod - def safe_extract( - data: SimpleNamespace | list[SimpleNamespace], - attribute_chain: str, - default: str | int | list | dict | SimpleNamespace = "", - ): - attributes = attribute_chain.split(".") - for attribute in attributes: - if "[" in attribute: - parts = attribute.split("[", 1) - attribute = parts[0] - index = parts[1].split("]", 1)[0] - try: - index = int(index) - data = getattr(data, attribute, None)[index] - except (IndexError, TypeError, ValueError): - return default - else: - data = getattr(data, attribute, None) - if not data: - return default - return data or default - - async def 
run( - self, - data: list[dict], - recorder, - type_="detail", - tiktok=False, - **kwargs, - ) -> list[dict]: - if type_ not in self.type.keys(): - raise TikTokDownloaderError - return await self.type[type_](data, recorder, tiktok, **kwargs) - - async def __batch( - self, - data: list[dict], - recorder, - tiktok: bool, - name: str, - mark: str, - earliest, - latest, - same=True, - ) -> list[dict]: - """批量下载作品""" - container = SimpleNamespace( - all_data=[], - template={ - "collection_time": datetime.now().strftime(self.date_format), - }, - cache=None, - name=name, - mark=mark, - same=same, # 是否相同作者 - earliest=earliest, - latest=latest, - ) - self.__platform_classify_detail(data, container, tiktok, ) - container.all_data = self.__clean_extract_data( - container.all_data, - self.detail_necessary_keys, - ) - self.__extract_item_records(container.all_data) - await self.__record_data(recorder, container.all_data) - self.__date_filter(container) - self.__condition_filter(container) - self.__summary_detail(container.all_data) - return container.all_data - - @staticmethod - def __condition_filter(container: SimpleNamespace, ): - """自定义筛选作品""" - result = [i for i in container.all_data if condition_filter(i)] - container.all_data = result - - def __summary_detail(self, data: list[dict], ): - """汇总作品数量""" - self.log.info(_("筛选处理后作品数量: {count}").format(count=len(data))) - - def __extract_batch( - self, - container: SimpleNamespace, - data: SimpleNamespace, - ) -> None: - """批量提取作品信息""" - container.cache = container.template.copy() - self.__extract_detail_info(container.cache, data) - self.__extract_account_info(container, data) - self.__extract_music(container.cache, data) - self.__extract_statistics(container.cache, data) - self.__extract_tags(container.cache, data) - self.__extract_extra_info(container.cache, data) - self.__extract_additional_info(container.cache, data) - container.all_data.append(container.cache) - - def __extract_batch_tiktok( - self, - container: 
SimpleNamespace, - data: SimpleNamespace, - ) -> None: - """批量提取作品信息""" - container.cache = container.template.copy() - self.__extract_detail_info_tiktok(container.cache, data) - self.__extract_account_info_tiktok(container, data) - self.__extract_music(container.cache, data, True) - self.__extract_statistics_tiktok(container.cache, data) - self.__extract_tags_tiktok(container.cache, data) - self.__extract_extra_info_tiktok(container.cache, data) - self.__extract_additional_info(container.cache, data, True) - container.all_data.append(container.cache) - - def __extract_extra_info(self, item: dict, data: SimpleNamespace): - if e := self.safe_extract(data, "anchor_info"): - extra = dumps( - e, - ensure_ascii=False, - indent=2, - default=lambda x: vars(x)) - else: - extra = "" - item["extra"] = extra - - def __extract_extra_info_tiktok(self, item: dict, data: SimpleNamespace): - # TODO: 尚未适配 TikTok 额外信息 - item["extra"] = "" - - def __extract_commodity_data(self, item: dict, data: SimpleNamespace): - pass - - def __extract_game_data(self, item: dict, data: SimpleNamespace): - pass - - def __extract_description(self, data: SimpleNamespace) -> str: - # 2023/11/11: 抖音不再折叠过长的作品描述 - return self.safe_extract(data, "desc") - # if len(desc := self.safe_extract(data, "desc")) < 107: - # return desc - # long_desc = self.safe_extract(data, "share_info.share_link_desc") - # return long_desc.split( - # " ", 1)[-1].split(" %s", 1)[0].replace("# ", "#") - - def __clean_description(self, desc: str) -> str: - return self.cleaner.clear_spaces(self.cleaner.filter(desc)) - - def __format_date(self, data: int, ) -> str: - return strftime( - self.date_format, - localtime(data or None), - ) - - def __extract_detail_info(self, item: dict, data: SimpleNamespace) -> None: - item["id"] = self.safe_extract(data, "aweme_id") - item["desc"] = self.__clean_description( - self.__extract_description(data), - ) or item["id"] - item["create_timestamp"] = self.safe_extract(data, "create_time") - 
item["create_time"] = self.__format_date(item["create_timestamp"]) - self.__extract_text_extra(item, data) - self.__classifying_detail(item, data) - - def __extract_detail_info_tiktok( - self, - item: dict, - data: SimpleNamespace, - ) -> None: - item["id"] = self.safe_extract(data, "id") - item["desc"] = self.__clean_description( - self.__extract_description(data)) or item["id"] - item["create_timestamp"] = self.safe_extract(data, "createTime", ) - item["create_time"] = self.__format_date(item["create_timestamp"]) - self.__extract_text_extra_tiktok(item, data) - self.__classifying_detail_tiktok(item, data) - - def __classifying_detail(self, item: dict, data: SimpleNamespace) -> None: - # 作品分类 - if images := self.safe_extract(data, "images"): - self.__extract_image_info(item, data, images) - else: - self.__extract_video_info(item, data, _("视频"), ) - - def __classifying_detail_tiktok( - self, - item: dict, - data: SimpleNamespace) -> None: - if images := self.safe_extract(data, "imagePost.images"): - self.__extract_image_info_tiktok(item, data, images) - else: - self.__extract_video_info_tiktok(item, data, _("视频"), ) - - def __extract_additional_info( - self, - item: dict, - data: SimpleNamespace, - tiktok=False, - ): - item["height"] = self.safe_extract(data, "video.height", -1) - item["width"] = self.safe_extract(data, "video.width", -1) - item["ratio"] = self.safe_extract(data, "video.ratio") - item["share_url"] = self.__generate_link( - item["type"], - item["id"], - item["unique_id"] if tiktok else None, - ) - - @staticmethod - def __generate_link(type_: str, id_: str, unique_id: str = None, ) -> str: - match bool(unique_id), type_: - case True, "视频": - return f"https://www.tiktok.com/@{unique_id}/video/{id_}" - case True, "图集": - return f"https://www.tiktok.com/@{unique_id}/photo/{id_}" - case False, "视频": - return f"https://www.douyin.com/video/{id_}" - case False, "图集" | "实况": - return f"https://www.douyin.com/note/{id_}" - case _: - return "" - - 
@staticmethod - def __clean_share_url(url: str) -> str: - if not url: - return url - parsed_url = urlparse(url) - return f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" - - def __extract_image_info( - self, - item: dict, - data: SimpleNamespace, - images: list[SimpleNamespace], - ) -> None: - if self.safe_extract(images[-1], "video"): - self.__set_blank_data(item, data, _("实况"), ) - item["downloads"] = [ - self.__classify_slides_item(i, ) for i in images - ] - else: - self.__set_blank_data(item, data, _("图集"), ) - item["downloads"] = [ - self.safe_extract( - i, - f'url_list[{IMAGE_INDEX}]', - ) for i in images - ] - - def __extract_image_info_tiktok( - self, - item: dict, - data: SimpleNamespace, - images: list) -> None: - self.__set_blank_data(item, data, _("图集"), ) - item["downloads"] = [ - self.safe_extract( - i, - f"imageURL.urlList[{IMAGE_TIKTOK_INDEX}]", - ) for i in images - ] - - def __set_blank_data( - self, - item: dict, - data: SimpleNamespace, - type_=_("图集"), - ): - item["type"] = type_ - item["duration"] = "00:00:00" - item["uri"] = "" - self.__extract_cover(item, data) - - def __extract_video_info( - self, - item: dict, - data: SimpleNamespace, - type_=_("视频"), - ) -> None: - item["type"] = type_ - item["downloads"] = self.__extract_video_download(data, ) - item["duration"] = self.time_conversion( - self.safe_extract(data, "video.duration", 0)) - item["uri"] = self.safe_extract( - data, "video.play_addr.uri") - self.__extract_cover(item, data, True) - - def __classify_slides_item(self, item: SimpleNamespace, ) -> str: - if self.safe_extract(item, "video"): - return self.__extract_video_download(item, ) - return self.safe_extract(item, f'url_list[{IMAGE_INDEX}]') - - def __extract_video_download(self, data: SimpleNamespace, ) -> str: - bit_rate: list[SimpleNamespace] = self.safe_extract( - data, - "video.bit_rate", - [], - ) - bit_rate: list[tuple[int, int, str, list[SimpleNamespace]]] = [( - i.FPS, - i.bit_rate, - i.gear_name, - 
i.play_addr - ) for i in bit_rate] - bit_rate.sort(key=lambda x: (int(x[2].split("_")[-2]), x[1], x[0],), ) - return self.safe_extract( - bit_rate[-1][-1], - f"url_list[{VIDEO_INDEX}]", - ) if bit_rate else "" - - def __extract_video_info_tiktok( - self, - item: dict, - data: SimpleNamespace, - type_=_("视频"), - ) -> None: - item["type"] = type_ - item["downloads"] = self.safe_extract( - data, "video.playAddr") - item["duration"] = self.time_conversion_tiktok( - self.safe_extract( - data, - "video.duration", - 0, - ) - ) - item["uri"] = self.safe_extract( - data, f"video.bitrateInfo[{BITRATE_INFO_TIKTOK_INDEX}].PlayAddr.Uri", - ) - self.__extract_cover_tiktok(item, data, True) - - @staticmethod - def time_conversion(time_: int) -> str: - second = time_ // 1000 - return f"{second // 3600:0>2d}:{second % 3600 // 60:0>2d}:{second % 3600 % 60:0>2d}" - - @staticmethod - def time_conversion_tiktok(seconds: int) -> str: - minutes, seconds = divmod(seconds, 60) - hours, minutes = divmod(minutes, 60) - return '{:02d}:{:02d}:{:02d}'.format( - int(hours), int(minutes), int(seconds)) - - def __extract_text_extra(self, item: dict, data: SimpleNamespace): - """作品标签""" - text = [ - self.safe_extract(i, "hashtag_name") - for i in self.safe_extract( - data, "text_extra", [] - ) - ] - item["text_extra"] = [i for i in text if i] - - def __extract_text_extra_tiktok(self, item: dict, data: SimpleNamespace): - """作品标签""" - text = [ - self.safe_extract(i, "hashtagName") - for i in self.safe_extract( - data, "textExtra", [] - ) - ] - item["text_extra"] = [i for i in text if i] - - def __extract_cover( - self, - item: dict, - data: SimpleNamespace, - has=False, - ) -> None: - if has: - # 动态封面图链接 - item["dynamic_cover"] = self.safe_extract( - data, f"video.dynamic_cover.url_list[{DYNAMIC_COVER_INDEX}]") - # 静态封面图链接 - item["origin_cover"] = self.safe_extract( - data, f"video.origin_cover.url_list[{ORIGIN_COVER_INDEX}]") - else: - item["dynamic_cover"], item["origin_cover"] = "", "" - - def 
__extract_cover_tiktok( - self, - item: dict, - data: SimpleNamespace, - has=False, - ) -> None: - if has: - # 动态封面图链接 - item["dynamic_cover"] = self.safe_extract( - data, "video.dynamicCover") - # 静态封面图链接 - item["origin_cover"] = self.safe_extract( - data, "video.originCover") - else: - item["dynamic_cover"], item["origin_cover"] = "", "" - - def __extract_music( - self, - item: dict, - data: SimpleNamespace, - tiktok=False, - ) -> None: - if music_data := self.safe_extract(data, "music"): - if tiktok: - author = self.safe_extract(music_data, "authorName") - title = self.safe_extract(music_data, "title") - url = self.safe_extract( - music_data, "playUrl") - else: - author = self.safe_extract(music_data, "author") - title = self.safe_extract(music_data, "title") - url = self.safe_extract( - music_data, f"play_url.url_list[{MUSIC_INDEX}]", - ) # 部分作品的音乐无法下载 - - else: - author, title, url = "", "", "" - item["music_author"] = author - item["music_title"] = title - item["music_url"] = url - - def __extract_statistics(self, item: dict, data: SimpleNamespace) -> None: - data = self.safe_extract(data, "statistics") - for i in self.statistics_keys: - item[i] = self.safe_extract(data, i, -1, ) - - def __extract_statistics_tiktok( - self, - item: dict, - data: SimpleNamespace) -> None: - data = self.safe_extract(data, "stats") - for i, j in enumerate(self.statistics_keys_tiktok): - item[self.statistics_keys[i]] = self.safe_extract(data, j, -1, ) - - def __extract_tags(self, item: dict, data: SimpleNamespace) -> None: - if not (t := self.safe_extract(data, "video_tag")): - item["tag"] = [] - else: - item["tag"] = [self.safe_extract(i, "tag_name") for i in t] - - def __extract_tags_tiktok(self, item: dict, data: SimpleNamespace) -> None: - if not (t := self.safe_extract(data, "textExtra")): - item["tag"] = [] - else: - item["tag"] = [self.safe_extract(i, "hashtagName") for i in t] - - def __extract_account_info( - self, - container: SimpleNamespace, - data: SimpleNamespace, - 
key="author", - ) -> None: - data = self.safe_extract(data, key) - container.cache["uid"] = self.safe_extract(data, "uid") - container.cache["sec_uid"] = self.safe_extract(data, "sec_uid") - # container.cache["short_id"] = self.safe_extract(data, "short_id") - container.cache["unique_id"] = self.safe_extract(data, "unique_id", ) - container.cache["signature"] = self.safe_extract(data, "signature") - container.cache["user_age"] = self.safe_extract(data, "user_age", -1) - self.__extract_nickname_info(container, data) - - def __extract_account_info_tiktok( - self, - container: SimpleNamespace, - data: SimpleNamespace, - key="author", - ) -> None: - data = self.safe_extract(data, key) - container.cache["uid"] = self.safe_extract(data, "id") - container.cache["sec_uid"] = self.safe_extract(data, "secUid") - container.cache["unique_id"] = self.safe_extract(data, "uniqueId") - container.cache["signature"] = self.safe_extract(data, "signature") - container.cache["user_age"] = -1 - self.__extract_nickname_info(container, data) - - def __extract_nickname_info( - self, - container: SimpleNamespace, - data: SimpleNamespace, - ) -> None: - if container.same: - container.cache["nickname"] = container.name - container.cache["mark"] = container.mark or container.name - else: - name = self.cleaner.filter_name( - self.safe_extract( - data, - "nickname", - _("已注销账号")), - default=_("无效账号昵称"), - ) - container.cache["nickname"] = name - container.cache["mark"] = name - - def preprocessing_data( - self, - data: list[dict] | dict, - tiktok: bool = False, - mode: str = ..., - mark: str = "", - user_id: str = "", - mix_id: str = "", - mix_title: str = "", - collect_id: str = "", - collect_name: str = "", - ) -> tuple[str, str, str,]: - if isinstance(data, dict): - info = self.get_user_info_tiktok(data) if tiktok else self.get_user_info(data) - if user_id != (s := info.get("sec_uid")): - self.log.error( - _("sec_user_id {user_id} 与 {s} 不一致").format(user_id=user_id, s=s), - ) - return "", "", 
"" - name = self.cleaner.filter_name( - info["nickname"], - info["uid"], - ) - mark = self.cleaner.filter_name( - mark, - name, - ) - return ( - info["uid"], - name, - mark, - ) - elif isinstance(data, list): - match mode: - case "post": - item = self.__select_item( - data, - user_id, - (self.extract_params_tiktok if tiktok else self.extract_params)["sec_uid"], - ) - id_, name, mark = self.__extract_pretreatment_data( - item, - (self.extract_params_tiktok if tiktok else self.extract_params)["uid"], - (self.extract_params_tiktok if tiktok else self.extract_params)["nickname"], - mark, - ) - return id_, name, mark - case "mix": - item = self.__select_item( - data, - mix_id, - (self.extract_params_tiktok if tiktok else self.extract_params)["mix_id"], - ) - id_, name, mark = self.__extract_pretreatment_data( - item, - (self.extract_params_tiktok if tiktok else self.extract_params)["mix_id"], - (self.extract_params_tiktok if tiktok else self.extract_params)["mix_title"], - mark, - mix_title, - ) - return id_, name, mark - case "favorite" | "collection": - pass - case "collects": - collect_name = self.cleaner.filter_name( - collect_name, - collect_id, - ) - return collect_id, collect_name, collect_name - else: - raise TikTokDownloaderError - - def __select_item(self, data: list[dict], id_: str, key: str): - """从多个数据返回对象""" - for item in data: - item = self.generate_data_object(item) - if id_ == self.safe_extract(item, key): - return item - raise TikTokDownloaderError(_("提取账号信息或合集信息失败,请向作者反馈!")) - - def __extract_pretreatment_data( - self, - item: SimpleNamespace, - id_: str, - name: str, - mark: str, - title: str = None, # TikTok 合辑需要直接传入标题 - ): - id_ = self.safe_extract(item, id_) - name = self.cleaner.filter_name( - title or self.safe_extract( - item, - name, - id_, - ), - ) - mark = self.cleaner.filter_name(mark, name, ) - return id_, name.strip(), mark.strip() - - def __platform_classify_detail( - self, - data: list[dict], - container: SimpleNamespace, - tiktok: 
bool) -> None: - if tiktok: - [ - self.__extract_batch_tiktok( - container, - self.generate_data_object(item), - ) for item in data - ] - else: - [ - self.__extract_batch( - container, - self.generate_data_object(item), - ) - for item in data - ] - - async def __detail( - self, - data: list[dict], - recorder, - tiktok: bool, - ) -> list[dict]: - container = SimpleNamespace( - all_data=[], - template={ - "collection_time": datetime.now().strftime(self.date_format), - }, - cache=None, - same=False, - ) - self.__platform_classify_detail(data, container, tiktok, ) - container.all_data = self.__clean_extract_data( - container.all_data, self.detail_necessary_keys) - self.__extract_item_records(container.all_data) - await self.__record_data(recorder, container.all_data) - self.__condition_filter(container) - return container.all_data - - async def __comment(self, data: list[dict], recorder, tiktok: bool, - source=False) -> list[dict]: - if not any(data): - return [] - container = SimpleNamespace( - all_data=[], - template={ - "collection_time": datetime.now().strftime(self.date_format), - }, - cache=None, - same=False, - ) - if source: - container.all_data = data - else: - [self.__extract_comments_data( - container, self.generate_data_object(i)) for i in data] - container.all_data = self.__clean_extract_data( - container.all_data, self.comment_necessary_keys) - await self.__record_data(recorder, container.all_data) - return container.all_data - - def __extract_comments_data( - self, - container: SimpleNamespace, - data: SimpleNamespace): - container.cache = container.template.copy() - container.cache["create_timestamp"] = self.safe_extract( - data, "create_time") - container.cache["create_time"] = self.__format_date( - container.cache["create_timestamp"]) - container.cache["ip_label"] = self.safe_extract(data, "ip_label", "未知") - container.cache["text"] = self.safe_extract(data, "text") - container.cache["image"] = self.safe_extract( - data, 
f"image_list[{COMMENT_IMAGE_LIST_INDEX}].origin_url.url_list[{COMMENT_IMAGE_INDEX}]") - container.cache["sticker"] = self.safe_extract( - data, f"sticker.static_url.url_list[{COMMENT_STICKER_INDEX}]") - container.cache["digg_count"] = self.safe_extract(data, "digg_count", -1) - container.cache["reply_to_reply_id"] = self.safe_extract( - data, "reply_to_reply_id") - container.cache["reply_comment_total"] = self.safe_extract(data, "reply_comment_total", 0) - container.cache["reply_id"] = self.safe_extract(data, "reply_id") - container.cache["cid"] = self.safe_extract(data, "cid") - self.__extract_account_info(container, data, "user") - container.all_data.append(container.cache) - - @classmethod - def extract_reply_ids(cls, data: list[dict]) -> list[str]: - container = SimpleNamespace( - reply_ids=[], - cache=None, - ) - for item in data: - item = cls.generate_data_object(item) - container.cache = { - "reply_comment_total": cls.safe_extract( - item, - "reply_comment_total", - 0, - ), - "cid": cls.safe_extract(item, "cid"), - } - cls.__filter_reply_ids(container) - return container.reply_ids - - @staticmethod - def __filter_reply_ids(container: SimpleNamespace): - if container.cache["reply_comment_total"] > 0: - container.reply_ids.append(container.cache["cid"]) - - async def __live( - self, - data: list[dict], - recorder, - tiktok: bool, - *args) -> list[dict]: - container = SimpleNamespace(all_data=[]) - if tiktok: - [self.__extract_live_data_tiktok( - container, self.generate_data_object(i)) for i in data] - else: - [self.__extract_live_data( - container, self.generate_data_object(i)) for i in data] - return container.all_data - - def __extract_live_data( - self, - container: SimpleNamespace, - data: SimpleNamespace): - data = self.safe_extract( - data, f"data.data[{LIVE_DATA_INDEX}]") or self.safe_extract( - data, "data.room") - live_data = { - "status": self.safe_extract(data, "status"), - "nickname": self.safe_extract(data, "owner.nickname"), - "title": 
def __extract_live_data_tiktok(
        self,
        container: SimpleNamespace,
        data: SimpleNamespace):
    """Collect the fields of one TikTok live room and append them to the container.

    Reads everything from the ``data`` attribute of the response object and
    appends a plain dict to ``container.all_data``.

    ``create_time`` becomes a ``datetime`` when a timestamp is present and
    the literal placeholder string otherwise. ``flv_pull_url`` is always a
    dict; it is empty when the response carries no pull URLs.
    """
    data = self.safe_extract(data, "data")
    created = self.safe_extract(data, "create_time")
    # safe_extract returns "" by default for a missing field, and vars("")
    # raises TypeError. Use an empty namespace as the fallback, as the
    # Douyin extractor does, so a room without stream URLs yields {}.
    pull_urls = self.safe_extract(
        data,
        "stream_url.flv_pull_url",
        SimpleNamespace(),
    )
    live_data = {
        "create_time": datetime.fromtimestamp(created) if created else "未知",
        "id_str": self.safe_extract(data, "id_str"),
        "like_count": self.safe_extract(data, "like_count"),
        "nickname": self.safe_extract(data, "owner.nickname"),
        "display_id": self.safe_extract(data, "owner.display_id"),
        "title": self.safe_extract(data, "title"),
        "user_count": self.safe_extract(data, "user_count"),
        "flv_pull_url": vars(pull_urls),
        "message": self.safe_extract(data, "message"),
        "prompts": self.safe_extract(data, "prompts"),
    }
    container.all_data.append(live_data)
container.cache["avatar"] = self.safe_extract( - data, f"avatar_larger.url_list[{AVATAR_LARGER_INDEX}]") - container.cache["city"] = self.safe_extract(data, "city") - container.cache["country"] = self.safe_extract(data, "country") - container.cache["district"] = self.safe_extract(data, "district") - container.cache["favoriting_count"] = self.safe_extract(data, "favoriting_count", -1) - container.cache["follower_count"] = self.safe_extract(data, "follower_count", -1) - container.cache["max_follower_count"] = self.safe_extract(data, "max_follower_count", -1) - container.cache["following_count"] = self.safe_extract(data, "following_count", -1) - container.cache["total_favorited"] = self.safe_extract(data, "total_favorited", -1) - container.cache["gender"] = {1: "男", 2: "女"}.get( - self.safe_extract(data, "gender"), - "未知", - ) - container.cache["ip_location"] = self.safe_extract(data, "ip_location") - container.cache["nickname"] = self.safe_extract(data, "nickname") - container.cache["province"] = self.safe_extract(data, "province") - container.cache["school_name"] = self.safe_extract(data, "school_name") - container.cache["sec_uid"] = self.safe_extract(data, "sec_uid") - container.cache["signature"] = self.safe_extract(data, "signature") - container.cache["uid"] = self.safe_extract(data, "uid") - container.cache["unique_id"] = self.safe_extract(data, "unique_id") - container.cache["user_age"] = self.safe_extract(data, "user_age", -1) - container.cache["cover"] = self.safe_extract( - data, f"cover_url[{AUTHOR_COVER_URL_INDEX}].url_list[{AUTHOR_COVER_INDEX}]") - container.cache["short_id"] = self.safe_extract(data, "short_id") - container.cache["aweme_count"] = self.safe_extract(data, "aweme_count", -1) - container.cache["verify"] = self.safe_extract( - data, "custom_verify", "无") - container.cache["enterprise"] = self.safe_extract( - data, "enterprise_verify_reason", "无") - container.cache["url"] = f"https://www.douyin.com/user/{container.cache["sec_uid"]}" - 
container.all_data.append(container.cache) - - async def __search( - self, - data: list[dict], - recorder, - tiktok: bool, - tab: int) -> list[dict]: - if tab in {0, 1}: - return await self.__search_general(data, recorder) - elif tab == 2: - return await self.__search_user(data, recorder) - elif tab == 3: - return await self.__search_live(data, recorder) - - async def __search_general(self, data: list[dict], recorder) -> list[dict]: - container = SimpleNamespace( - all_data=[], - cache=None, - template={ - "collection_time": datetime.now().strftime(self.date_format), - }, - same=False, - ) - [self.__search_result_classify( - container, self.generate_data_object(i)) for i in data] - await self.__record_data(recorder, container.all_data) - return container.all_data - - def __search_result_classify( - self, - container: SimpleNamespace, - data: SimpleNamespace): - if d := self.safe_extract(data, "aweme_info"): - self.__extract_batch(container, d) - elif d := self.safe_extract(data, "aweme_mix_info.mix_items"): - [self.__extract_batch(container, i) for i in d] - elif d := self.safe_extract(data, "card_info.attached_info.aweme_list"): - [self.__extract_batch(container, i) for i in d] - elif d := self.safe_extract(data, f"user_list[{SEARCH_USER_INDEX}].items"): - [self.__extract_batch(container, i) for i in d] - # elif d := self.safe_extract(data, "user_list.user_info"): - # pass - # elif d := self.safe_extract(data, "music_list"): - # pass - # elif d := self.safe_extract(data, "common_aladdin"): - # pass - self.log.error(f"Unreported search results: {data}", False) - - async def __search_user( - self, - data: list[dict], - recorder) -> list[dict]: - container = SimpleNamespace( - all_data=[], - cache=None, - template={ - "collection_time": datetime.now().strftime(self.date_format), - }, - ) - [self.__deal_search_user_live(container, self.generate_data_object( - i["user_info"])) for i in data] - await self.__record_data(recorder, container.all_data) - return 
container.all_data - - def __deal_search_user_live( - self, - container: SimpleNamespace, - data: SimpleNamespace, - user=True, - ): - if user: - container.cache = container.template.copy() - container.cache["avatar"] = self.safe_extract( - data, f"{'avatar_thumb' if user else 'avatar_larger'}.url_list[{SEARCH_AVATAR_INDEX}]") - container.cache["nickname"] = self.safe_extract(data, "nickname") - container.cache["sec_uid"] = self.safe_extract(data, "sec_uid") - container.cache["signature"] = self.safe_extract(data, "signature") - container.cache["uid"] = self.safe_extract(data, "uid") - container.cache["short_id"] = self.safe_extract(data, "short_id") - container.cache["verify"] = self.safe_extract( - data, "custom_verify", "无") - container.cache["enterprise"] = self.safe_extract( - data, "enterprise_verify_reason", "无") - if user: - container.cache["follower_count"] = self.safe_extract(data, "follower_count", -1) - container.cache["total_favorited"] = self.safe_extract(data, "total_favorited", -1) - container.cache["unique_id"] = self.safe_extract(data, "unique_id") - container.all_data.append(container.cache) - # else: - # pass - - async def __search_live( - self, - data: list[dict], - recorder) -> list[dict]: - container = SimpleNamespace( - all_data=[], - cache=None, - template={ - "collection_time": datetime.now().strftime(self.date_format), - }, - ) - [self.__deal_search_live( - container, self.generate_data_object(i["lives"])) for i in data] - await self.__record_data(recorder, container.all_data) - return container.all_data - - def __deal_search_live(self, - container: SimpleNamespace, - data: SimpleNamespace): - container.cache = container.template.copy() - self.__deal_search_user_live( - container, self.safe_extract( - data, "author"), False) - container.cache["room_id"] = self.safe_extract(data, "aweme_id") - container.all_data.append(container.cache) - - async def __hot( - self, - data: list[dict], - recorder, - tiktok: bool, - ) -> list[dict]: - 
all_data = [] - [self.__deal_hot_data(all_data, self.generate_data_object(i)) - for i in data] - await self.__record_data(recorder, all_data) - return all_data - - def __deal_hot_data(self, container: list, data: SimpleNamespace): - cache = { - "position": str(self.safe_extract(data, "position", -1)), - "sentence_id": self.safe_extract(data, "sentence_id"), - "word": self.safe_extract(data, "word"), - "video_count": str(self.safe_extract(data, "video_count", -1)), - "event_time": self.__format_date(self.safe_extract(data, "event_time")), - "view_count": str(self.safe_extract(data, "view_count", -1)), - "hot_value": str(self.safe_extract(data, "hot_value", -1)), - "cover": self.safe_extract(data, f"word_cover.url_list[{HOT_WORD_COVER_INDEX}]"), - } - container.append(cache) - - async def __record_data(self, record, data: list[dict]): - # 记录数据 - for i in data: - await record.save(self.__extract_values(record, i)) - - @staticmethod - def __extract_values(record, data: dict) -> list: - return [data[key] for key in record.field_keys] - - @staticmethod - def __date_filter(container: SimpleNamespace): - # print("前", len(container.all_data)) # 调试代码 - result = [] - for item in container.all_data: - create_time = datetime.fromtimestamp( - item["create_timestamp"]).date() - if container.earliest <= create_time <= container.latest: - result.append(item) - # else: - # print("丢弃", item) # 调试代码 - # print("后", len(result)) # 调试代码 - container.all_data = result - - def source_date_filter( - self, - data: list[dict], - earliest: "date", - latest: "date", - tiktok=False, - ) -> list[dict]: - if tiktok: - return self.__source_date_filter( - data, - "createTime", - earliest=earliest, - latest=latest, - ) - return self.__source_date_filter( - data, - earliest=earliest, - latest=latest, - ) - - def __source_date_filter( - self, - data: list[dict], - key: str = "create_time", - earliest: "date" = ..., - latest: "date" = ..., - ) -> list[dict]: - result = [] - for item in data: - if not 
(create_time := item.get(key, 0)): - result.append(item) - continue - create_time = datetime.fromtimestamp(create_time).date() - if earliest <= create_time <= latest: - result.append(item) - self.__summary_detail(result) - return result - - @classmethod - def extract_mix_id(cls, data: dict) -> str: - data = cls.generate_data_object(data) - return cls.safe_extract(data, "mix_info.mix_id") - - def __extract_item_records(self, data: list[dict]): - # 记录提取成功的条目 - for i in data: - self.log.info(f"{i['type']} {i['id']} 数据提取成功", False) - - @classmethod - def extract_mix_collect_info(cls, data: list[dict]) -> list[dict]: - data = cls.generate_data_object(data) - return [ - { - "title": Extractor.safe_extract(i, "mix_name"), - "id": Extractor.safe_extract(i, "mix_id"), - } - for i in data - ] - - @classmethod - def extract_collects_info(cls, data: list[dict]) -> list[dict]: - data = cls.generate_data_object(data) - return [ - { - "name": Extractor.safe_extract(i, "collects_name"), - "id": Extractor.safe_extract(i, "collects_id_str"), - } - for i in data - ] - - @staticmethod - def __clean_extract_data(data: list[dict], key: str) -> list[dict]: - # 去除无效数据 - return [i for i in data if i.get(key)] - - async def __music( - self, - data: list[dict], - recorder, - tiktok=False, - ) -> list[dict]: - """暂不记录收藏音乐数据""" - container = SimpleNamespace( - all_data=[], - template={ - "collection_time": datetime.now().strftime(self.date_format), - }, - cache=None, - same=False, - ) - [ - self.__extract_collection_music( - container, - self.generate_data_object(item), - ) for item in data - ] - return container.all_data - - def __extract_collection_music( - self, - container: SimpleNamespace, - data: SimpleNamespace, - ): - container.cache = container.template.copy() - container.cache["id"] = self.safe_extract(data, "id_str") - container.cache["title"] = self.safe_extract(data, "title") - container.cache["author"] = self.safe_extract(data, "author") - container.cache["album"] = 
self.safe_extract(data, "album") - container.cache["cover"] = self.safe_extract( - data, f"cover_hd.url_list[{MUSIC_COLLECTION_COVER_INDEX}]") - container.cache["download"] = self.safe_extract( - data, f"play_url.url_list[{MUSIC_COLLECTION_DOWNLOAD_INDEX}]") - container.cache["duration"] = self.time_conversion( - self.safe_extract(data, "duration", 0)) - container.all_data.append(container.cache) +from datetime import datetime +from json import dumps +from time import localtime +from time import strftime +from types import SimpleNamespace +from typing import TYPE_CHECKING +from urllib.parse import urlparse + +from ..custom import ( + VIDEO_INDEX, + IMAGE_INDEX, + IMAGE_TIKTOK_INDEX, + DYNAMIC_COVER_INDEX, + ORIGIN_COVER_INDEX, + MUSIC_INDEX, + COMMENT_IMAGE_INDEX, + COMMENT_STICKER_INDEX, + LIVE_COVER_INDEX, + AUTHOR_COVER_INDEX, + HOT_WORD_COVER_INDEX, + COMMENT_IMAGE_LIST_INDEX, + BITRATE_INFO_TIKTOK_INDEX, + LIVE_DATA_INDEX, + AVATAR_LARGER_INDEX, + AUTHOR_COVER_URL_INDEX, + SEARCH_USER_INDEX, + SEARCH_AVATAR_INDEX, + MUSIC_COLLECTION_COVER_INDEX, + MUSIC_COLLECTION_DOWNLOAD_INDEX, +) +from ..custom import condition_filter +from ..tools import TikTokDownloaderError +from ..translation import _ + +if TYPE_CHECKING: + from ..config import Parameter + from datetime import date + +__all__ = ["Extractor"] + + +class Extractor: + statistics_keys = ( + "digg_count", + "comment_count", + "collect_count", + "share_count", + "play_count", + ) + statistics_keys_tiktok = ( + "diggCount", + "commentCount", + "collectCount", + "shareCount", + "playCount", + ) + detail_necessary_keys = "id" + comment_necessary_keys = "cid" + user_necessary_keys = "sec_uid" + extract_params_tiktok = { + "sec_uid": "author.secUid", + "mix_id": "playlistId", + "uid": "author.id", + "nickname": "author.nickname", + "mix_title": "playlistId", # TikTok 不返回合辑标题 + } + extract_params = { + "sec_uid": "author.sec_uid", + "mix_id": "mix_info.mix_id", + "uid": "author.uid", + "nickname": 
"author.nickname", + "mix_title": "mix_info.mix_name", + } + + def __init__(self, params: "Parameter"): + self.log = params.logger + self.date_format = params.date_format + self.cleaner = params.CLEANER + self.type = { + "batch": self.__batch, + "detail": self.__detail, + "comment": self.__comment, + "live": self.__live, + "user": self.__user, + "search": self.__search, + "hot": self.__hot, + "music": self.__music, + } + + def get_user_info(self, data: dict) -> dict: + try: + return { + "nickname": data["nickname"], + "sec_uid": data["sec_uid"], + "uid": data["uid"], + } + except (KeyError, TypeError): + self.log.error(_("提取账号信息失败: {data}").format(data=data)) + return {} + + def get_user_info_tiktok(self, data: dict) -> dict: + try: + return { + "nickname": data["user"]["nickname"], + "sec_uid": data["user"]["secUid"], + "uid": data["user"]["id"], + } + except (KeyError, TypeError): + self.log.error(_("提取账号信息失败: {data}").format(data=data)) + return {} + + @staticmethod + def generate_data_object( + data: dict | list, + ) -> SimpleNamespace | list[SimpleNamespace]: + def depth_conversion(element): + if isinstance(element, dict): + return SimpleNamespace( + **{k: depth_conversion(v) for k, v in element.items()}) + elif isinstance(element, list): + return [depth_conversion(item) for item in element] + else: + return element + + return depth_conversion(data) + + @staticmethod + def safe_extract( + data: SimpleNamespace | list[SimpleNamespace], + attribute_chain: str, + default: str | int | list | dict | SimpleNamespace = "", + ): + attributes = attribute_chain.split(".") + for attribute in attributes: + if "[" in attribute: + parts = attribute.split("[", 1) + attribute = parts[0] + index = parts[1].split("]", 1)[0] + try: + index = int(index) + data = getattr(data, attribute, None)[index] + except (IndexError, TypeError, ValueError): + return default + else: + data = getattr(data, attribute, None) + if not data: + return default + return data or default + + async def 
async def __batch(
        self,
        data: list[dict],
        recorder,
        tiktok: bool,
        name: str,
        mark: str,
        earliest,
        latest,
        same=True,
) -> list[dict]:
    """Extract, record and filter a batch of works.

    :param data: raw response items, one dict per work
    :param recorder: storage object passed on to ``__record_data``
    :param tiktok: selects the TikTok extraction path instead of the Douyin one
    :param name: account name, stored on the working container
    :param mark: account mark, stored on the working container
    :param earliest: lower bound consumed by ``__date_filter``
    :param latest: upper bound consumed by ``__date_filter``
    :param same: whether all works belong to the same author
    :return: the extracted items that survive both filters
    """
    # Working state shared by all extraction helpers; ``template`` is the
    # seed for each item and stamps it with the collection time.
    container = SimpleNamespace(
        all_data=[],
        template={
            "collection_time": datetime.now().strftime(self.date_format),
        },
        cache=None,
        name=name,
        mark=mark,
        same=same,  # whether all works share one author
        earliest=earliest,
        latest=latest,
    )
    self.__platform_classify_detail(data, container, tiktok, )
    # Drop entries whose necessary key (``detail_necessary_keys``) is empty.
    container.all_data = self.__clean_extract_data(
        container.all_data,
        self.detail_necessary_keys,
    )
    self.__extract_item_records(container.all_data)
    # Recording happens before filtering, so every extracted item is
    # handed to the recorder even if the filters below discard it.
    await self.__record_data(recorder, container.all_data)
    self.__date_filter(container)
    self.__condition_filter(container)
    self.__summary_detail(container.all_data)
    return container.all_data
def __extract_extra_info(self, item: dict, data: SimpleNamespace):
    """Store the work's ``anchor_info`` as pretty-printed JSON in ``item["extra"]``.

    Nested namespace objects are serialised through their attribute
    dictionaries. When ``anchor_info`` is absent or empty, the field is set
    to an empty string.
    """
    anchor_info = self.safe_extract(data, "anchor_info")
    item["extra"] = dumps(
        anchor_info,
        ensure_ascii=False,
        indent=2,
        default=vars,
    ) if anchor_info else ""
@staticmethod
def __generate_link(type_: str, id_: str, unique_id: str = None, ) -> str:
    """Build the public share URL of a work.

    :param type_: the work type label stored in ``item["type"]``
    :param id_: the work ID
    :param unique_id: TikTok account handle; a non-empty value selects a
        TikTok link, an empty value or ``None`` a Douyin link
    :return: the share URL, or ``""`` for an unsupported combination

    The type label is produced through the translation function ``_`` when
    the item is extracted, so it is compared against both the raw label and
    its translation. Matching only the raw literal returns ``""`` for every
    work once a translation is active.
    """
    if type_ in ("视频", _("视频")):
        page = "video"
    elif type_ in ("图集", _("图集")):
        page = "photo" if unique_id else "note"
    elif type_ in ("实况", _("实况")) and not unique_id:
        # Live photos are only linked on Douyin.
        page = "note"
    else:
        return ""
    if unique_id:
        return f"https://www.tiktok.com/@{unique_id}/{page}/{id_}"
    return f"https://www.douyin.com/{page}/{id_}"
@staticmethod + def __clean_share_url(url: str) -> str: + if not url: + return url + parsed_url = urlparse(url) + return f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" + + def __extract_image_info( + self, + item: dict, + data: SimpleNamespace, + images: list[SimpleNamespace], + ) -> None: + if self.safe_extract(images[-1], "video"): + self.__set_blank_data(item, data, _("实况"), ) + item["downloads"] = [ + self.__classify_slides_item(i, ) for i in images + ] + else: + self.__set_blank_data(item, data, _("图集"), ) + item["downloads"] = [ + self.safe_extract( + i, + f'url_list[{IMAGE_INDEX}]', + ) for i in images + ] + + def __extract_image_info_tiktok( + self, + item: dict, + data: SimpleNamespace, + images: list) -> None: + self.__set_blank_data(item, data, _("图集"), ) + item["downloads"] = [ + self.safe_extract( + i, + f"imageURL.urlList[{IMAGE_TIKTOK_INDEX}]", + ) for i in images + ] + + def __set_blank_data( + self, + item: dict, + data: SimpleNamespace, + type_=_("图集"), + ): + item["type"] = type_ + item["duration"] = "00:00:00" + item["uri"] = "" + self.__extract_cover(item, data) + + def __extract_video_info( + self, + item: dict, + data: SimpleNamespace, + type_=_("视频"), + ) -> None: + item["type"] = type_ + item["downloads"] = self.__extract_video_download(data, ) + item["duration"] = self.time_conversion( + self.safe_extract(data, "video.duration", 0)) + item["uri"] = self.safe_extract( + data, "video.play_addr.uri") + self.__extract_cover(item, data, True) + + def __classify_slides_item(self, item: SimpleNamespace, ) -> str: + if self.safe_extract(item, "video"): + return self.__extract_video_download(item, ) + return self.safe_extract(item, f'url_list[{IMAGE_INDEX}]') + + def __extract_video_download(self, data: SimpleNamespace, ) -> str: + bit_rate: list[SimpleNamespace] = self.safe_extract( + data, + "video.bit_rate", + [], + ) + bit_rate: list[tuple[int, int, str, list[SimpleNamespace]]] = [( + i.FPS, + i.bit_rate, + i.gear_name, + 
i.play_addr + ) for i in bit_rate] + bit_rate.sort(key=lambda x: (int(x[2].split("_")[-2]), x[1], x[0],), ) + return self.safe_extract( + bit_rate[-1][-1], + f"url_list[{VIDEO_INDEX}]", + ) if bit_rate else "" + + def __extract_video_info_tiktok( + self, + item: dict, + data: SimpleNamespace, + type_=_("视频"), + ) -> None: + item["type"] = type_ + item["downloads"] = self.safe_extract( + data, "video.playAddr") + item["duration"] = self.time_conversion_tiktok( + self.safe_extract( + data, + "video.duration", + 0, + ) + ) + item["uri"] = self.safe_extract( + data, f"video.bitrateInfo[{BITRATE_INFO_TIKTOK_INDEX}].PlayAddr.Uri", + ) + self.__extract_cover_tiktok(item, data, True) + + @staticmethod + def time_conversion(time_: int) -> str: + second = time_ // 1000 + return f"{second // 3600:0>2d}:{second % 3600 // 60:0>2d}:{second % 3600 % 60:0>2d}" + + @staticmethod + def time_conversion_tiktok(seconds: int) -> str: + minutes, seconds = divmod(seconds, 60) + hours, minutes = divmod(minutes, 60) + return '{:02d}:{:02d}:{:02d}'.format( + int(hours), int(minutes), int(seconds)) + + def __extract_text_extra(self, item: dict, data: SimpleNamespace): + """作品标签""" + text = [ + self.safe_extract(i, "hashtag_name") + for i in self.safe_extract( + data, "text_extra", [] + ) + ] + item["text_extra"] = [i for i in text if i] + + def __extract_text_extra_tiktok(self, item: dict, data: SimpleNamespace): + """作品标签""" + text = [ + self.safe_extract(i, "hashtagName") + for i in self.safe_extract( + data, "textExtra", [] + ) + ] + item["text_extra"] = [i for i in text if i] + + def __extract_cover( + self, + item: dict, + data: SimpleNamespace, + has=False, + ) -> None: + if has: + # 动态封面图链接 + item["dynamic_cover"] = self.safe_extract( + data, f"video.dynamic_cover.url_list[{DYNAMIC_COVER_INDEX}]") + # 静态封面图链接 + item["origin_cover"] = self.safe_extract( + data, f"video.origin_cover.url_list[{ORIGIN_COVER_INDEX}]") + else: + item["dynamic_cover"], item["origin_cover"] = "", "" + + def 
__extract_cover_tiktok( + self, + item: dict, + data: SimpleNamespace, + has=False, + ) -> None: + if has: + # 动态封面图链接 + item["dynamic_cover"] = self.safe_extract( + data, "video.dynamicCover") + # 静态封面图链接 + item["origin_cover"] = self.safe_extract( + data, "video.originCover") + else: + item["dynamic_cover"], item["origin_cover"] = "", "" + + def __extract_music( + self, + item: dict, + data: SimpleNamespace, + tiktok=False, + ) -> None: + if music_data := self.safe_extract(data, "music"): + if tiktok: + author = self.safe_extract(music_data, "authorName") + title = self.safe_extract(music_data, "title") + url = self.safe_extract( + music_data, "playUrl") + else: + author = self.safe_extract(music_data, "author") + title = self.safe_extract(music_data, "title") + url = self.safe_extract( + music_data, f"play_url.url_list[{MUSIC_INDEX}]", + ) # 部分作品的音乐无法下载 + + else: + author, title, url = "", "", "" + item["music_author"] = author + item["music_title"] = title + item["music_url"] = url + + def __extract_statistics(self, item: dict, data: SimpleNamespace) -> None: + data = self.safe_extract(data, "statistics") + for i in self.statistics_keys: + item[i] = self.safe_extract(data, i, -1, ) + + def __extract_statistics_tiktok( + self, + item: dict, + data: SimpleNamespace) -> None: + data = self.safe_extract(data, "stats") + for i, j in enumerate(self.statistics_keys_tiktok): + item[self.statistics_keys[i]] = self.safe_extract(data, j, -1, ) + + def __extract_tags(self, item: dict, data: SimpleNamespace) -> None: + if not (t := self.safe_extract(data, "video_tag")): + item["tag"] = [] + else: + item["tag"] = [self.safe_extract(i, "tag_name") for i in t] + + def __extract_tags_tiktok(self, item: dict, data: SimpleNamespace) -> None: + if not (t := self.safe_extract(data, "textExtra")): + item["tag"] = [] + else: + item["tag"] = [self.safe_extract(i, "hashtagName") for i in t] + + def __extract_account_info( + self, + container: SimpleNamespace, + data: SimpleNamespace, + 
key="author", + ) -> None: + data = self.safe_extract(data, key) + container.cache["uid"] = self.safe_extract(data, "uid") + container.cache["sec_uid"] = self.safe_extract(data, "sec_uid") + # container.cache["short_id"] = self.safe_extract(data, "short_id") + container.cache["unique_id"] = self.safe_extract(data, "unique_id", ) + container.cache["signature"] = self.safe_extract(data, "signature") + container.cache["user_age"] = self.safe_extract(data, "user_age", -1) + self.__extract_nickname_info(container, data) + + def __extract_account_info_tiktok( + self, + container: SimpleNamespace, + data: SimpleNamespace, + key="author", + ) -> None: + data = self.safe_extract(data, key) + container.cache["uid"] = self.safe_extract(data, "id") + container.cache["sec_uid"] = self.safe_extract(data, "secUid") + container.cache["unique_id"] = self.safe_extract(data, "uniqueId") + container.cache["signature"] = self.safe_extract(data, "signature") + container.cache["user_age"] = -1 + self.__extract_nickname_info(container, data) + + def __extract_nickname_info( + self, + container: SimpleNamespace, + data: SimpleNamespace, + ) -> None: + if container.same: + container.cache["nickname"] = container.name + container.cache["mark"] = container.mark or container.name + else: + name = self.cleaner.filter_name( + self.safe_extract( + data, + "nickname", + _("已注销账号")), + default=_("无效账号昵称"), + ) + container.cache["nickname"] = name + container.cache["mark"] = name + + def preprocessing_data( + self, + data: list[dict] | dict, + tiktok: bool = False, + mode: str = ..., + mark: str = "", + user_id: str = "", + mix_id: str = "", + mix_title: str = "", + collect_id: str = "", + collect_name: str = "", + ) -> tuple[str, str, str,]: + if isinstance(data, dict): + info = self.get_user_info_tiktok(data) if tiktok else self.get_user_info(data) + if user_id != (s := info.get("sec_uid")): + self.log.error( + _("sec_user_id {user_id} 与 {s} 不一致").format(user_id=user_id, s=s), + ) + return "", "", 
"" + name = self.cleaner.filter_name( + info["nickname"], + info["uid"], + ) + mark = self.cleaner.filter_name( + mark, + name, + ) + return ( + info["uid"], + name, + mark, + ) + elif isinstance(data, list): + match mode: + case "post": + item = self.__select_item( + data, + user_id, + (self.extract_params_tiktok if tiktok else self.extract_params)["sec_uid"], + ) + id_, name, mark = self.__extract_pretreatment_data( + item, + (self.extract_params_tiktok if tiktok else self.extract_params)["uid"], + (self.extract_params_tiktok if tiktok else self.extract_params)["nickname"], + mark, + ) + return id_, name, mark + case "mix": + item = self.__select_item( + data, + mix_id, + (self.extract_params_tiktok if tiktok else self.extract_params)["mix_id"], + ) + id_, name, mark = self.__extract_pretreatment_data( + item, + (self.extract_params_tiktok if tiktok else self.extract_params)["mix_id"], + (self.extract_params_tiktok if tiktok else self.extract_params)["mix_title"], + mark, + mix_title, + ) + return id_, name, mark + case "favorite" | "collection": + pass + case "collects": + collect_name = self.cleaner.filter_name( + collect_name, + collect_id, + ) + return collect_id, collect_name, collect_name + else: + raise TikTokDownloaderError + + def __select_item(self, data: list[dict], id_: str, key: str): + """从多个数据返回对象""" + for item in data: + item = self.generate_data_object(item) + if id_ == self.safe_extract(item, key): + return item + raise TikTokDownloaderError(_("提取账号信息或合集信息失败,请向作者反馈!")) + + def __extract_pretreatment_data( + self, + item: SimpleNamespace, + id_: str, + name: str, + mark: str, + title: str = None, # TikTok 合辑需要直接传入标题 + ): + id_ = self.safe_extract(item, id_) + name = self.cleaner.filter_name( + title or self.safe_extract( + item, + name, + id_, + ), + ) + mark = self.cleaner.filter_name(mark, name, ) + return id_, name.strip(), mark.strip() + + def __platform_classify_detail( + self, + data: list[dict], + container: SimpleNamespace, + tiktok: 
bool) -> None: + if tiktok: + [ + self.__extract_batch_tiktok( + container, + self.generate_data_object(item), + ) for item in data + ] + else: + [ + self.__extract_batch( + container, + self.generate_data_object(item), + ) + for item in data + ] + + async def __detail( + self, + data: list[dict], + recorder, + tiktok: bool, + ) -> list[dict]: + container = SimpleNamespace( + all_data=[], + template={ + "collection_time": datetime.now().strftime(self.date_format), + }, + cache=None, + same=False, + ) + self.__platform_classify_detail(data, container, tiktok, ) + container.all_data = self.__clean_extract_data( + container.all_data, self.detail_necessary_keys) + self.__extract_item_records(container.all_data) + await self.__record_data(recorder, container.all_data) + self.__condition_filter(container) + return container.all_data + + async def __comment(self, data: list[dict], recorder, tiktok: bool, + source=False) -> list[dict]: + if not any(data): + return [] + container = SimpleNamespace( + all_data=[], + template={ + "collection_time": datetime.now().strftime(self.date_format), + }, + cache=None, + same=False, + ) + if source: + container.all_data = data + else: + [self.__extract_comments_data( + container, self.generate_data_object(i)) for i in data] + container.all_data = self.__clean_extract_data( + container.all_data, self.comment_necessary_keys) + await self.__record_data(recorder, container.all_data) + return container.all_data + + def __extract_comments_data( + self, + container: SimpleNamespace, + data: SimpleNamespace): + container.cache = container.template.copy() + container.cache["create_timestamp"] = self.safe_extract( + data, "create_time") + container.cache["create_time"] = self.__format_date( + container.cache["create_timestamp"]) + container.cache["ip_label"] = self.safe_extract(data, "ip_label", "未知") + container.cache["text"] = self.safe_extract(data, "text") + container.cache["image"] = self.safe_extract( + data, 
f"image_list[{COMMENT_IMAGE_LIST_INDEX}].origin_url.url_list[{COMMENT_IMAGE_INDEX}]") + container.cache["sticker"] = self.safe_extract( + data, f"sticker.static_url.url_list[{COMMENT_STICKER_INDEX}]") + container.cache["digg_count"] = self.safe_extract(data, "digg_count", -1) + container.cache["reply_to_reply_id"] = self.safe_extract( + data, "reply_to_reply_id") + container.cache["reply_comment_total"] = self.safe_extract(data, "reply_comment_total", 0) + container.cache["reply_id"] = self.safe_extract(data, "reply_id") + container.cache["cid"] = self.safe_extract(data, "cid") + self.__extract_account_info(container, data, "user") + container.all_data.append(container.cache) + + @classmethod + def extract_reply_ids(cls, data: list[dict]) -> list[str]: + container = SimpleNamespace( + reply_ids=[], + cache=None, + ) + for item in data: + item = cls.generate_data_object(item) + container.cache = { + "reply_comment_total": cls.safe_extract( + item, + "reply_comment_total", + 0, + ), + "cid": cls.safe_extract(item, "cid"), + } + cls.__filter_reply_ids(container) + return container.reply_ids + + @staticmethod + def __filter_reply_ids(container: SimpleNamespace): + if container.cache["reply_comment_total"] > 0: + container.reply_ids.append(container.cache["cid"]) + + async def __live( + self, + data: list[dict], + recorder, + tiktok: bool, + *args) -> list[dict]: + container = SimpleNamespace(all_data=[]) + if tiktok: + [self.__extract_live_data_tiktok( + container, self.generate_data_object(i)) for i in data] + else: + [self.__extract_live_data( + container, self.generate_data_object(i)) for i in data] + return container.all_data + + def __extract_live_data( + self, + container: SimpleNamespace, + data: SimpleNamespace): + data = self.safe_extract( + data, f"data.data[{LIVE_DATA_INDEX}]") or self.safe_extract( + data, "data.room") + live_data = { + "status": self.safe_extract(data, "status"), + "nickname": self.safe_extract(data, "owner.nickname"), + "title": 
self.safe_extract(data, "title"), + "flv_pull_url": vars( + self.safe_extract( + data, + "stream_url.flv_pull_url", + SimpleNamespace(), + ) + ), + "hls_pull_url_map": vars( + self.safe_extract( + data, + "stream_url.hls_pull_url_map", + SimpleNamespace(), + ) + ), + "cover": self.safe_extract(data, f"cover.url_list[{LIVE_COVER_INDEX}]"), + "total_user_str": self.safe_extract(data, "stats.total_user_str"), + "user_count_str": self.safe_extract(data, "stats.user_count_str"), + } + container.all_data.append(live_data) + + def __extract_live_data_tiktok( + self, + container: SimpleNamespace, + data: SimpleNamespace): + data = self.safe_extract(data, "data") + live_data = { + "create_time": datetime.fromtimestamp(t) if ( + t := self.safe_extract(data, "create_time")) else "未知", + "id_str": self.safe_extract(data, "id_str"), + "like_count": self.safe_extract(data, "like_count"), + "nickname": self.safe_extract(data, "owner.nickname"), + "display_id": self.safe_extract(data, "owner.display_id"), + "title": self.safe_extract(data, "title"), + "user_count": self.safe_extract(data, "user_count"), + "flv_pull_url": vars(self.safe_extract(data, "stream_url.flv_pull_url")), + "message": self.safe_extract(data, "message"), + "prompts": self.safe_extract(data, "prompts"), + } + container.all_data.append(live_data) + + async def __user( + self, + data: list[dict], + recorder, + tiktok: bool, + ) -> list[dict]: + container = SimpleNamespace( + all_data=[], + cache=None, + template={ + "collection_time": datetime.now().strftime(self.date_format), + }, + ) + extract = self.__extract_tiktok_user_data if tiktok else self.__extract_user_data + + [extract(container,self.generate_data_object(i)) for i in data] + if not tiktok: + container.all_data = self.__clean_extract_data( + container.all_data, self.user_necessary_keys) + await self.__record_data(recorder, container.all_data) + return container.all_data + + def __extract_user_data( + self, + container: SimpleNamespace, + data: 
SimpleNamespace, + ): + container.cache = container.template.copy() + container.cache["avatar"] = self.safe_extract( + data, f"avatar_larger.url_list[{AVATAR_LARGER_INDEX}]") + container.cache["city"] = self.safe_extract(data, "city") + container.cache["country"] = self.safe_extract(data, "country") + container.cache["district"] = self.safe_extract(data, "district") + container.cache["favoriting_count"] = self.safe_extract(data, "favoriting_count", -1) + container.cache["follower_count"] = self.safe_extract(data, "follower_count", -1) + container.cache["max_follower_count"] = self.safe_extract(data, "max_follower_count", -1) + container.cache["following_count"] = self.safe_extract(data, "following_count", -1) + container.cache["total_favorited"] = self.safe_extract(data, "total_favorited", -1) + container.cache["gender"] = {1: "男", 2: "女"}.get( + self.safe_extract(data, "gender"), + "未知", + ) + container.cache["ip_location"] = self.safe_extract(data, "ip_location") + container.cache["nickname"] = self.safe_extract(data, "nickname") + container.cache["province"] = self.safe_extract(data, "province") + container.cache["school_name"] = self.safe_extract(data, "school_name") + container.cache["sec_uid"] = self.safe_extract(data, "sec_uid") + container.cache["signature"] = self.safe_extract(data, "signature") + container.cache["uid"] = self.safe_extract(data, "uid") + container.cache["unique_id"] = self.safe_extract(data, "unique_id") + container.cache["user_age"] = self.safe_extract(data, "user_age", -1) + container.cache["cover"] = self.safe_extract( + data, f"cover_url[{AUTHOR_COVER_URL_INDEX}].url_list[{AUTHOR_COVER_INDEX}]") + container.cache["short_id"] = self.safe_extract(data, "short_id") + container.cache["aweme_count"] = self.safe_extract(data, "aweme_count", -1) + container.cache["verify"] = self.safe_extract( + data, "custom_verify", "无") + container.cache["enterprise"] = self.safe_extract( + data, "enterprise_verify_reason", "无") + container.cache["url"] = 
f"https://www.douyin.com/user/{container.cache["sec_uid"]}" + container.all_data.append(container.cache) + + def __extract_tiktok_user_data( + self, + container: SimpleNamespace, + data: SimpleNamespace, + ): + container.cache = container.template.copy() + # Basic user info + container.cache["collection_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + container.cache["nickname"] = self.safe_extract(data, "nickname") + container.cache["signature"] = self.safe_extract(data, "signature") + container.cache["uniqueId"] = self.safe_extract(data, "uniqueId") + container.cache["url"] = f"https://www.tiktok.com/{container.cache['uniqueId']}" + + # Avatar URLs + container.cache["avatarThumb"] = self.safe_extract(data, "avatarThumb") + container.cache["avatarMedium"] = self.safe_extract(data, "avatarMedium") + container.cache["avatarLarger"] = self.safe_extract(data, "avatarLarger") + + # User identifiers + container.cache["secUid"] = self.safe_extract(data, "secUid") + + # Numeric metrics + container.cache["link"] = self.safe_extract(data, "link", -1) + container.cache["risk"] = self.safe_extract(data, "risk", -1) + container.cache["videoCount"] = self.safe_extract(data, "videoCount", -1) + container.cache["heartCount"] = self.safe_extract(data, "heartCount", -1) + container.cache["followerCount"] = self.safe_extract(data, "followerCount", -1) + container.cache["followingCount"] = self.safe_extract(data, "followingCount", -1) + container.cache["stitchSetting"] = self.safe_extract(data, "stitchSetting", 0) + + # Boolean flags + container.cache["isEmbedBanned"] = self.safe_extract(data, "isEmbedBanned", False) + container.cache["isADVirtual"] = self.safe_extract(data, "isADVirtual", False) + container.cache["openFavorite"] = self.safe_extract(data, "openFavorite", False) + container.cache["privateAccount"] = self.safe_extract(data, "privateAccount", False) + container.cache["canExpPlaylist"] = self.safe_extract(data, "canExpPlaylist", False) + 
container.cache["verified"] = self.safe_extract(data, "verified", False) + container.cache["ttSeller"] = self.safe_extract(data, "ttSeller", False) + + container.all_data.append(container.cache) + async def __search( + self, + data: list[dict], + recorder, + tiktok: bool, + tab: int) -> list[dict]: + if tab in {0, 1}: + return await self.__search_general(data, recorder) + elif tab == 2: + return await self.__search_user(data, recorder) + elif tab == 3: + return await self.__search_live(data, recorder) + + async def __search_general(self, data: list[dict], recorder) -> list[dict]: + container = SimpleNamespace( + all_data=[], + cache=None, + template={ + "collection_time": datetime.now().strftime(self.date_format), + }, + same=False, + ) + [self.__search_result_classify( + container, self.generate_data_object(i)) for i in data] + await self.__record_data(recorder, container.all_data) + return container.all_data + + def __search_result_classify( + self, + container: SimpleNamespace, + data: SimpleNamespace): + if d := self.safe_extract(data, "aweme_info"): + self.__extract_batch(container, d) + elif d := self.safe_extract(data, "aweme_mix_info.mix_items"): + [self.__extract_batch(container, i) for i in d] + elif d := self.safe_extract(data, "card_info.attached_info.aweme_list"): + [self.__extract_batch(container, i) for i in d] + elif d := self.safe_extract(data, f"user_list[{SEARCH_USER_INDEX}].items"): + [self.__extract_batch(container, i) for i in d] + # elif d := self.safe_extract(data, "user_list.user_info"): + # pass + # elif d := self.safe_extract(data, "music_list"): + # pass + # elif d := self.safe_extract(data, "common_aladdin"): + # pass + self.log.error(f"Unreported search results: {data}", False) + + async def __search_user( + self, + data: list[dict], + recorder) -> list[dict]: + container = SimpleNamespace( + all_data=[], + cache=None, + template={ + "collection_time": datetime.now().strftime(self.date_format), + }, + ) + 
[self.__deal_search_user_live(container, self.generate_data_object( + i["user_info"])) for i in data] + await self.__record_data(recorder, container.all_data) + return container.all_data + + def __deal_search_user_live( + self, + container: SimpleNamespace, + data: SimpleNamespace, + user=True, + ): + if user: + container.cache = container.template.copy() + container.cache["avatar"] = self.safe_extract( + data, f"{'avatar_thumb' if user else 'avatar_larger'}.url_list[{SEARCH_AVATAR_INDEX}]") + container.cache["nickname"] = self.safe_extract(data, "nickname") + container.cache["sec_uid"] = self.safe_extract(data, "sec_uid") + container.cache["signature"] = self.safe_extract(data, "signature") + container.cache["uid"] = self.safe_extract(data, "uid") + container.cache["short_id"] = self.safe_extract(data, "short_id") + container.cache["verify"] = self.safe_extract( + data, "custom_verify", "无") + container.cache["enterprise"] = self.safe_extract( + data, "enterprise_verify_reason", "无") + if user: + container.cache["follower_count"] = self.safe_extract(data, "follower_count", -1) + container.cache["total_favorited"] = self.safe_extract(data, "total_favorited", -1) + container.cache["unique_id"] = self.safe_extract(data, "unique_id") + container.all_data.append(container.cache) + # else: + # pass + + async def __search_live( + self, + data: list[dict], + recorder) -> list[dict]: + container = SimpleNamespace( + all_data=[], + cache=None, + template={ + "collection_time": datetime.now().strftime(self.date_format), + }, + ) + [self.__deal_search_live( + container, self.generate_data_object(i["lives"])) for i in data] + await self.__record_data(recorder, container.all_data) + return container.all_data + + def __deal_search_live(self, + container: SimpleNamespace, + data: SimpleNamespace): + container.cache = container.template.copy() + self.__deal_search_user_live( + container, self.safe_extract( + data, "author"), False) + container.cache["room_id"] = 
self.safe_extract(data, "aweme_id") + container.all_data.append(container.cache) + + async def __hot( + self, + data: list[dict], + recorder, + tiktok: bool, + ) -> list[dict]: + all_data = [] + [self.__deal_hot_data(all_data, self.generate_data_object(i)) + for i in data] + await self.__record_data(recorder, all_data) + return all_data + + def __deal_hot_data(self, container: list, data: SimpleNamespace): + cache = { + "position": str(self.safe_extract(data, "position", -1)), + "sentence_id": self.safe_extract(data, "sentence_id"), + "word": self.safe_extract(data, "word"), + "video_count": str(self.safe_extract(data, "video_count", -1)), + "event_time": self.__format_date(self.safe_extract(data, "event_time")), + "view_count": str(self.safe_extract(data, "view_count", -1)), + "hot_value": str(self.safe_extract(data, "hot_value", -1)), + "cover": self.safe_extract(data, f"word_cover.url_list[{HOT_WORD_COVER_INDEX}]"), + } + container.append(cache) + + async def __record_data(self, record, data: list[dict]): + # 记录数据 + for i in data: + await record.save(self.__extract_values(record, i)) + + @staticmethod + def __extract_values(record, data: dict) -> list: + return [str(value).replace('\n', ' ') for key in record.field_keys for value in (data[key],)] + + @staticmethod + def __date_filter(container: SimpleNamespace): + # print("前", len(container.all_data)) # 调试代码 + result = [] + for item in container.all_data: + create_time = datetime.fromtimestamp( + item["create_timestamp"]).date() + if container.earliest <= create_time <= container.latest: + result.append(item) + # else: + # print("丢弃", item) # 调试代码 + # print("后", len(result)) # 调试代码 + container.all_data = result + + def source_date_filter( + self, + data: list[dict], + earliest: "date", + latest: "date", + tiktok=False, + ) -> list[dict]: + if tiktok: + return self.__source_date_filter( + data, + "createTime", + earliest=earliest, + latest=latest, + ) + return self.__source_date_filter( + data, + 
earliest=earliest, + latest=latest, + ) + + def __source_date_filter( + self, + data: list[dict], + key: str = "create_time", + earliest: "date" = ..., + latest: "date" = ..., + ) -> list[dict]: + result = [] + for item in data: + if not (create_time := item.get(key, 0)): + result.append(item) + continue + create_time = datetime.fromtimestamp(create_time).date() + if earliest <= create_time <= latest: + result.append(item) + self.__summary_detail(result) + return result + + @classmethod + def extract_mix_id(cls, data: dict) -> str: + data = cls.generate_data_object(data) + return cls.safe_extract(data, "mix_info.mix_id") + + def __extract_item_records(self, data: list[dict]): + # 记录提取成功的条目 + for i in data: + self.log.info(f"{i['type']} {i['id']} 数据提取成功", False) + + @classmethod + def extract_mix_collect_info(cls, data: list[dict]) -> list[dict]: + data = cls.generate_data_object(data) + return [ + { + "title": Extractor.safe_extract(i, "mix_name"), + "id": Extractor.safe_extract(i, "mix_id"), + } + for i in data + ] + + @classmethod + def extract_collects_info(cls, data: list[dict]) -> list[dict]: + data = cls.generate_data_object(data) + return [ + { + "name": Extractor.safe_extract(i, "collects_name"), + "id": Extractor.safe_extract(i, "collects_id_str"), + } + for i in data + ] + + @staticmethod + def __clean_extract_data(data: list[dict], key: str) -> list[dict]: + # 去除无效数据 + return [i for i in data if i.get(key)] + + async def __music( + self, + data: list[dict], + recorder, + tiktok=False, + ) -> list[dict]: + """暂不记录收藏音乐数据""" + container = SimpleNamespace( + all_data=[], + template={ + "collection_time": datetime.now().strftime(self.date_format), + }, + cache=None, + same=False, + ) + [ + self.__extract_collection_music( + container, + self.generate_data_object(item), + ) for item in data + ] + return container.all_data + + def __extract_collection_music( + self, + container: SimpleNamespace, + data: SimpleNamespace, + ): + container.cache = 
container.template.copy() + container.cache["id"] = self.safe_extract(data, "id_str") + container.cache["title"] = self.safe_extract(data, "title") + container.cache["author"] = self.safe_extract(data, "author") + container.cache["album"] = self.safe_extract(data, "album") + container.cache["cover"] = self.safe_extract( + data, f"cover_hd.url_list[{MUSIC_COLLECTION_COVER_INDEX}]") + container.cache["download"] = self.safe_extract( + data, f"play_url.url_list[{MUSIC_COLLECTION_DOWNLOAD_INDEX}]") + container.cache["duration"] = self.time_conversion( + self.safe_extract(data, "duration", 0)) + container.all_data.append(container.cache) diff --git a/src/interface/user.py b/src/interface/user.py index b24c6f3f..8802f986 100644 --- a/src/interface/user.py +++ b/src/interface/user.py @@ -1,89 +1,90 @@ -from typing import TYPE_CHECKING, Callable, Type, Coroutine -from typing import Union - -from src.interface.template import API -from src.testers import Params -from src.translation import _ - -if TYPE_CHECKING: - from src.config import Parameter - - -class User(API): - def __init__( - self, - params: Union["Parameter", Params], - cookie: str = None, - proxy: str = None, - sec_user_id: str = ..., - *args, - **kwargs, - ): - super().__init__(params, cookie, proxy, *args, **kwargs, ) - self.sec_user_id = sec_user_id - self.api = f"{self.domain}aweme/v1/web/user/profile/other/" - self.text = _("账号简略") - - async def run(self, *args, **kwargs): - return await super().run( - single_page=True, - data_key="user", - ) - - async def run_batch( - self, - data_key: str, - error_text="", - cursor="cursor", - has_more="has_more", - params: Callable = lambda: {}, - data: Callable = lambda: {}, - method="GET", - headers: dict = None, - callback: Type[Coroutine] = None, - *args, - **kwargs, ): - pass - - def check_response( - self, - data_dict: dict, - data_key: str, - error_text="", - *args, - **kwargs, - ): - try: - if not (d := data_dict[data_key]): - self.log.warning(error_text) - else: 
- self.response = d - except KeyError: - self.log.error(_("数据解析失败,请告知作者处理: {data}").format(data=data_dict)) - self.finished = True - - def generate_params(self, ) -> dict: - return self.params | { - "publish_video_strategy_type": "2", - "sec_user_id": self.sec_user_id, - "personal_center_strategy": "1", - "profile_other_record_enable": "1", - "land_to": "1", - "version_code": "170400", - "version_name": "17.4.0", - } - - -async def test(): - async with Params() as params: - i = User( - params, - sec_user_id="", - ) - print(await i.run()) - - -if __name__ == "__main__": - from asyncio import run - - run(test()) +from typing import TYPE_CHECKING, Callable, Type, Coroutine +from typing import Union + +from src.interface.template import API +from src.testers import Params +from src.translation import _ + +if TYPE_CHECKING: + from src.config import Parameter + + +class User(API): + def __init__( + self, + params: Union["Parameter", Params], + cookie: str = None, + proxy: str = None, + sec_user_id: str = ..., + data_key: str = "user", + *args, + **kwargs, + ): + super().__init__(params, cookie, proxy, *args, **kwargs, ) + self.sec_user_id = sec_user_id + self.api = f"{self.domain}aweme/v1/web/user/profile/other/" + self.text = _("账号简略") + self.data_key = data_key + async def run(self, *args, **kwargs): + return await super().run( + single_page=True, + data_key=self.data_key, + ) + + async def run_batch( + self, + data_key: str, + error_text="", + cursor="cursor", + has_more="has_more", + params: Callable = lambda: {}, + data: Callable = lambda: {}, + method="GET", + headers: dict = None, + callback: Type[Coroutine] = None, + *args, + **kwargs, ): + pass + + def check_response( + self, + data_dict: dict, + data_key: str, + error_text="", + *args, + **kwargs, + ): + try: + if not (d := data_dict[data_key]): + self.log.warning(error_text) + else: + self.response = d + except KeyError: + self.log.error(_("数据解析失败,请告知作者处理: {data}").format(data=data_dict)) + self.finished = 
True + + def generate_params(self, ) -> dict: + return self.params | { + "publish_video_strategy_type": "2", + "sec_user_id": self.sec_user_id, + "personal_center_strategy": "1", + "profile_other_record_enable": "1", + "land_to": "1", + "version_code": "170400", + "version_name": "17.4.0", + } + + +async def test(): + async with Params() as params: + i = User( + params, + sec_user_id="", + ) + print(await i.run()) + + +if __name__ == "__main__": + from asyncio import run + + run(test()) diff --git a/src/interface/user_tiktok.py b/src/interface/user_tiktok.py new file mode 100644 index 00000000..713d4126 --- /dev/null +++ b/src/interface/user_tiktok.py @@ -0,0 +1,85 @@ +import time +from datetime import datetime +from typing import TYPE_CHECKING, Callable, Type, Coroutine, Union + +from src.interface.user import User +from src.interface.template import APITikTok +from src.testers import Params + +if TYPE_CHECKING: + from src.config import Parameter + + +class UserTikTok(User, APITikTok): + def __init__( + self, + params: Union["Parameter", Params], + cookie: str | dict = None, + proxy: str = None, + sec_user_id: str = ..., + data_key: str = "userInfo", + uniqueId: str = "", + *args, + **kwargs, + ): + super().__init__( + params, + cookie, + proxy, + sec_user_id, + *args, + **kwargs, + ) + self.api = f"{APITikTok.domain}api/user/detail/" + self.data_key = data_key + self.uniqueId = uniqueId + + def generate_params(self) -> dict: + return self.params | { + "uniqueId": self.uniqueId, + "priority_region": "", + } + + async def run( + self, + referer: str = None, + data_key: str = "userInfo", + error_text="", + *args, + **kwargs + ): + self.set_referer(referer or f"{self.domain}@{self.sec_user_id}") + + data = await super().run( + single_page=True, + data_key=self.data_key, + error_text=error_text, + *args, + **kwargs + ) + + def flatten_dict(d, sep='_'): + items = [] + for k, v in d.items(): + if isinstance(v, dict): + items.extend(flatten_dict(v, sep=sep).items()) + 
else: + items.append((k, v)) + return dict(items) + data = flatten_dict(data) + return data + + +async def test(): + async with Params() as params: + i = UserTikTok( + params, + sec_user_id="", + ) + print(await i.run()) + + +if __name__ == "__main__": + from asyncio import run + + run(test()) \ No newline at end of file diff --git a/src/storage/manager.py b/src/storage/manager.py index e058946c..674eb333 100644 --- a/src/storage/manager.py +++ b/src/storage/manager.py @@ -1,223 +1,258 @@ -from typing import TYPE_CHECKING - -from .csv import CSVLogger -from .sqlite import SQLLogger -from .text import BaseTextLogger -from .xlsx import XLSXLogger - -if TYPE_CHECKING: - from ..config import Parameter - -__all__ = ["RecordManager"] - - -class RecordManager: - """检查数据储存路径和文件夹""" - detail = ( - ("type", "作品类型", "TEXT",), - ("collection_time", "采集时间", "TEXT",), - ("uid", "UID", "TEXT",), - ("sec_uid", "SEC_UID", "TEXT",), - ("unique_id", "ID", "TEXT",), - # ("short_id", "SHORT_ID", "TEXT",), - ("id", "作品ID", "TEXT",), - ("desc", "作品描述", "TEXT",), - ("text_extra", "作品话题", "TEXT",), - ("duration", "视频时长", "TEXT",), - ("ratio", "视频分辨率", "TEXT",), - ("height", "视频高度", "INTEGER",), - ("width", "视频宽度", "INTEGER",), - ("share_url", "作品链接", "TEXT",), - ("create_time", "发布时间", "TEXT",), - ("uri", "视频URI", "TEXT",), - ("nickname", "账号昵称", "TEXT",), - ("user_age", "年龄", "INTEGER",), - ("signature", "账号签名", "TEXT",), - ("downloads", "下载地址", "TEXT",), - ("music_author", "音乐作者", "TEXT",), - ("music_title", "音乐标题", "TEXT",), - ("music_url", "音乐链接", "TEXT",), - ("origin_cover", "静态封面", "TEXT",), - ("dynamic_cover", "动态封面", "TEXT",), - ("tag", "隐藏标签", "TEXT",), - ("digg_count", "点赞数量", "INTEGER",), - ("comment_count", "评论数量", "INTEGER",), - ("collect_count", "收藏数量", "INTEGER",), - ("share_count", "分享数量", "INTEGER",), - ("play_count", "播放数量", "INTEGER",), - ("extra", "额外信息", "TEXT",), - ) - comment = ( - ("collection_time", "采集时间", "TEXT",), - ("cid", "评论ID", "TEXT",), - ("create_time", 
"评论时间", "TEXT",), - ("uid", "UID", "TEXT",), - ("sec_uid", "SEC_UID", "TEXT",), - # ("short_id", "SHORT_ID", "TEXT",), - # ("unique_id", "抖音号", "TEXT",), - ("nickname", "账号昵称", "TEXT",), - ("signature", "账号签名", "TEXT",), - ("user_age", "年龄", "INTEGER",), - ("ip_label", "IP归属地", "TEXT",), - ("text", "评论内容", "TEXT",), - ("sticker", "评论表情", "TEXT",), - ("image", "评论图片", "TEXT",), - ("digg_count", "点赞数量", "INTEGER",), - ("reply_comment_total", "回复数量", "INTEGER",), - ("reply_id", "回复ID", "TEXT",), - ("reply_to_reply_id", "回复对象", "TEXT",), - ) - user = ( - ("collection_time", "采集时间", "TEXT",), - ("nickname", "昵称昵称", "TEXT",), - ("url", "账号链接", "TEXT",), - ("signature", "账号签名", "TEXT",), - ("unique_id", "抖音号", "TEXT",), - ("user_age", "年龄", "INTEGER",), - ("gender", "性别", "TEXT",), - ("country", "国家", "TEXT",), - ("province", "省份", "TEXT",), - ("city", "城市", "TEXT",), - ("district", "地区", "TEXT",), - ("ip_location", "IP归属地", "TEXT",), - ("verify", "标签", "TEXT",), - ("enterprise", "企业", "TEXT",), - ("sec_uid", "SEC_UID", "TEXT",), - ("uid", "UID", "TEXT",), - ("short_id", "SHORT_ID", "TEXT",), - ("avatar", "头像链接", "TEXT",), - ("cover", "背景图链接", "TEXT",), - ("aweme_count", "作品数量", "INTEGER",), - ("total_favorited", "获赞数量", "INTEGER",), - ("favoriting_count", "喜欢数量", "INTEGER",), - ("follower_count", "粉丝数量", "INTEGER",), - ("following_count", "关注数量", "INTEGER",), - ("max_follower_count", "粉丝最大值", "INTEGER",), - ) - search_user = ( - ("collection_time", "采集时间", "TEXT",), - ("uid", "UID", "TEXT",), - ("sec_uid", "SEC_UID", "TEXT",), - ("nickname", "账号昵称", "TEXT",), - ("unique_id", "抖音号", "TEXT",), - ("short_id", "SHORT_ID", "TEXT",), - ("avatar", "头像链接", "TEXT",), - ("signature", "账号签名", "TEXT",), - ("verify", "标签", "TEXT",), - ("enterprise", "企业", "TEXT",), - ("follower_count", "粉丝数量", "INTEGER",), - ("total_favorited", "获赞数量", "INTEGER",), - ) - search_live = ( - ("collection_time", "采集时间", "TEXT",), - ("room_id", "直播ID", "TEXT",), - ("uid", "UID", "TEXT",), - ("sec_uid", 
"SEC_UID", "TEXT",), - ("nickname", "账号昵称", "TEXT",), - ("short_id", "SHORT_ID", "TEXT",), - ("avatar", "头像链接", "TEXT",), - ("signature", "账号签名", "TEXT",), - ("verify", "标签", "TEXT",), - ("enterprise", "企业", "TEXT",), - ) - hot = ( - ("position", "排名", "INTEGER",), - ("word", "内容", "TEXT",), - ("hot_value", "热度", "INTEGER",), - ("cover", "封面", "TEXT",), - ("event_time", "时间", "TEXT",), - ("view_count", "浏览数量", "INTEGER",), - ("video_count", "视频数量", "INTEGER",), - ("sentence_id", "SENTENCE_ID", "TEXT",), - ) - - detail_keys = [i[0] for i in detail] - detail_name = [i[1] for i in detail] - detail_type = [i[2] for i in detail] - comment_keys = [i[0] for i in comment] - comment_name = [i[1] for i in comment] - comment_type = [i[2] for i in comment] - user_keys = [i[0] for i in user] - user_name = [i[1] for i in user] - user_type = [i[2] for i in user] - search_user_keys = [i[0] for i in search_user] - search_user_name = [i[1] for i in search_user] - search_user_type = [i[2] for i in search_user] - search_live_keys = [i[0] for i in search_live] - search_live_name = [i[1] for i in search_live] - search_live_type = [i[2] for i in search_live] - hot_keys = [i[0] for i in hot] - hot_name = [i[1] for i in hot] - hot_type = [i[2] for i in hot] - - LoggerParams = { - "detail": { - "db_name": "DetailData.db", - "title_line": detail_name, - "title_type": detail_type, - "field_keys": detail_keys, - }, - "comment": { - "db_name": "CommentData.db", - "title_line": comment_name, - "title_type": comment_type, - "field_keys": comment_keys, - }, - "user": { - "db_name": "UserData.db", - "title_line": user_name, - "title_type": user_type, - "field_keys": user_keys, - }, - "mix": { - "db_name": "MixData.db", - "title_line": detail_name, - "title_type": detail_type, - "field_keys": detail_keys, - }, - "search_general": { - "db_name": "SearchData.db", - "title_line": detail_name, - "title_type": detail_type, - "field_keys": detail_keys, - }, - "search_user": { - "db_name": "SearchData.db", 
- "title_line": search_user_name, - "title_type": search_user_type, - "field_keys": search_user_keys, - }, - "search_live": { - "db_name": "SearchData.db", - "title_line": search_live_name, - "title_type": search_live_type, - "field_keys": search_live_keys, - }, - "hot": { - "db_name": "BoardData.db", - "title_line": hot_name, - "title_type": hot_type, - "field_keys": hot_keys, - }, - } - DataLogger = { - "csv": CSVLogger, - "xlsx": XLSXLogger, - "sql": SQLLogger, - # "mysql": BaseTextLogger, - } - - def run( - self, - parameter: "Parameter", - folder="", - type_="detail", - blank=False, - ): - root = parameter.root.joinpath( - parameter.CLEANER.filter_name(folder, "Data")) - root.mkdir(exist_ok=True) - params = self.LoggerParams[type_] - logger = BaseTextLogger if blank else self.DataLogger.get( - parameter.storage_format, BaseTextLogger) - return root, params, logger +from typing import TYPE_CHECKING + +from .csv import CSVLogger +from .sqlite import SQLLogger +from .text import BaseTextLogger +from .xlsx import XLSXLogger + +if TYPE_CHECKING: + from ..config import Parameter + +__all__ = ["RecordManager"] + + +class RecordManager: + """检查数据储存路径和文件夹""" + detail = ( + ("type", "作品类型", "TEXT",), + ("collection_time", "采集时间", "TEXT",), + ("uid", "UID", "TEXT",), + ("sec_uid", "SEC_UID", "TEXT",), + ("unique_id", "ID", "TEXT",), + # ("short_id", "SHORT_ID", "TEXT",), + ("id", "作品ID", "TEXT",), + ("desc", "作品描述", "TEXT",), + ("text_extra", "作品话题", "TEXT",), + ("duration", "视频时长", "TEXT",), + ("ratio", "视频分辨率", "TEXT",), + ("height", "视频高度", "INTEGER",), + ("width", "视频宽度", "INTEGER",), + ("share_url", "作品链接", "TEXT",), + ("create_time", "发布时间", "TEXT",), + ("uri", "视频URI", "TEXT",), + ("nickname", "账号昵称", "TEXT",), + ("user_age", "年龄", "INTEGER",), + ("signature", "账号签名", "TEXT",), + ("downloads", "下载地址", "TEXT",), + ("music_author", "音乐作者", "TEXT",), + ("music_title", "音乐标题", "TEXT",), + ("music_url", "音乐链接", "TEXT",), + ("origin_cover", "静态封面", "TEXT",), + 
("dynamic_cover", "动态封面", "TEXT",), + ("tag", "隐藏标签", "TEXT",), + ("digg_count", "点赞数量", "INTEGER",), + ("comment_count", "评论数量", "INTEGER",), + ("collect_count", "收藏数量", "INTEGER",), + ("share_count", "分享数量", "INTEGER",), + ("play_count", "播放数量", "INTEGER",), + ("extra", "额外信息", "TEXT",), + ) + comment = ( + ("collection_time", "采集时间", "TEXT",), + ("cid", "评论ID", "TEXT",), + ("create_time", "评论时间", "TEXT",), + ("uid", "UID", "TEXT",), + ("sec_uid", "SEC_UID", "TEXT",), + # ("short_id", "SHORT_ID", "TEXT",), + # ("unique_id", "抖音号", "TEXT",), + ("nickname", "账号昵称", "TEXT",), + ("signature", "账号签名", "TEXT",), + ("user_age", "年龄", "INTEGER",), + ("ip_label", "IP归属地", "TEXT",), + ("text", "评论内容", "TEXT",), + ("sticker", "评论表情", "TEXT",), + ("image", "评论图片", "TEXT",), + ("digg_count", "点赞数量", "INTEGER",), + ("reply_comment_total", "回复数量", "INTEGER",), + ("reply_id", "回复ID", "TEXT",), + ("reply_to_reply_id", "回复对象", "TEXT",), + ) + user = ( + ("collection_time", "采集时间", "TEXT",), + ("nickname", "昵称昵称", "TEXT",), + ("url", "账号链接", "TEXT",), + ("signature", "账号签名", "TEXT",), + ("unique_id", "抖音号", "TEXT",), + ("user_age", "年龄", "INTEGER",), + ("gender", "性别", "TEXT",), + ("country", "国家", "TEXT",), + ("province", "省份", "TEXT",), + ("city", "城市", "TEXT",), + ("district", "地区", "TEXT",), + ("ip_location", "IP归属地", "TEXT",), + ("verify", "标签", "TEXT",), + ("enterprise", "企业", "TEXT",), + ("sec_uid", "SEC_UID", "TEXT",), + ("uid", "UID", "TEXT",), + ("short_id", "SHORT_ID", "TEXT",), + ("avatar", "头像链接", "TEXT",), + ("cover", "背景图链接", "TEXT",), + ("aweme_count", "作品数量", "INTEGER",), + ("total_favorited", "获赞数量", "INTEGER",), + ("favoriting_count", "喜欢数量", "INTEGER",), + ("follower_count", "粉丝数量", "INTEGER",), + ("following_count", "关注数量", "INTEGER",), + ("max_follower_count", "粉丝最大值", "INTEGER",), + ) + user_tiktok = ( + ("collection_time", "采集时间", "TEXT",), + ("nickname", "昵称昵称", "TEXT",), + ("url", "账号链接", "TEXT",), + ("signature", "账号签名", "TEXT",), + ("uniqueId", "抖音号", 
"TEXT",), + ("avatarThumb", "缩略头像链接", "TEXT",), + ("avatarMedium", "中图头像链接", "TEXT",), + ("avatarLarger", "大图头像链接", "TEXT",), + ("secUid", "SEC_UID", "TEXT",), + ("link", "用户设置链接", "TEXT",), + ("risk", "链接风险", "INTEGER",), + ("videoCount", "作品数量", "INTEGER",), + ("heartCount", "获赞数量", "INTEGER",), + ("followerCount", "粉丝数量", "INTEGER",), + ("followingCount", "关注数量", "INTEGER",), + ("stitchSetting", "拼接设置", "INTEGER",), + ("privateAccount", "是否允许嵌入", "INTEGER",), + ("isEmbedBanned", "账号是否被禁止使用嵌入功能", "BOOL",), + ("isADVirtual", "是否为广告虚拟号", "BOOL",), + ("openFavorite", "是否开放喜欢列表", "BOOL",), + ("privateAccount", "是否为私有账户", "BOOL",), + ("canExpPlaylist", "是否允许导出播放列表", "BOOL",), + ("verified", "标签", "BOOL",), + ("ttSeller", "认证商家", "BOOL",), + ) + search_user = ( + ("collection_time", "采集时间", "TEXT",), + ("uid", "UID", "TEXT",), + ("sec_uid", "SEC_UID", "TEXT",), + ("nickname", "账号昵称", "TEXT",), + ("unique_id", "抖音号", "TEXT",), + ("short_id", "SHORT_ID", "TEXT",), + ("avatar", "头像链接", "TEXT",), + ("signature", "账号签名", "TEXT",), + ("verify", "标签", "TEXT",), + ("enterprise", "企业", "TEXT",), + ("follower_count", "粉丝数量", "INTEGER",), + ("total_favorited", "获赞数量", "INTEGER",), + ) + search_live = ( + ("collection_time", "采集时间", "TEXT",), + ("room_id", "直播ID", "TEXT",), + ("uid", "UID", "TEXT",), + ("sec_uid", "SEC_UID", "TEXT",), + ("nickname", "账号昵称", "TEXT",), + ("short_id", "SHORT_ID", "TEXT",), + ("avatar", "头像链接", "TEXT",), + ("signature", "账号签名", "TEXT",), + ("verify", "标签", "TEXT",), + ("enterprise", "企业", "TEXT",), + ) + hot = ( + ("position", "排名", "INTEGER",), + ("word", "内容", "TEXT",), + ("hot_value", "热度", "INTEGER",), + ("cover", "封面", "TEXT",), + ("event_time", "时间", "TEXT",), + ("view_count", "浏览数量", "INTEGER",), + ("video_count", "视频数量", "INTEGER",), + ("sentence_id", "SENTENCE_ID", "TEXT",), + ) + + detail_keys = [i[0] for i in detail] + detail_name = [i[1] for i in detail] + detail_type = [i[2] for i in detail] + comment_keys = [i[0] for i in comment] + 
comment_name = [i[1] for i in comment] + comment_type = [i[2] for i in comment] + user_keys = [i[0] for i in user] + user_name = [i[1] for i in user] + user_type = [i[2] for i in user] + user_tiktok_keys = [i[0] for i in user_tiktok] + user_tiktok_name = [i[1] for i in user_tiktok] + user_tiktok_type = [i[2] for i in user_tiktok] + search_user_keys = [i[0] for i in search_user] + search_user_name = [i[1] for i in search_user] + search_user_type = [i[2] for i in search_user] + search_live_keys = [i[0] for i in search_live] + search_live_name = [i[1] for i in search_live] + search_live_type = [i[2] for i in search_live] + hot_keys = [i[0] for i in hot] + hot_name = [i[1] for i in hot] + hot_type = [i[2] for i in hot] + + LoggerParams = { + "detail": { + "db_name": "DetailData.db", + "title_line": detail_name, + "title_type": detail_type, + "field_keys": detail_keys, + }, + "comment": { + "db_name": "CommentData.db", + "title_line": comment_name, + "title_type": comment_type, + "field_keys": comment_keys, + }, + "user": { + "db_name": "UserData.db", + "title_line": user_name, + "title_type": user_type, + "field_keys": user_keys, + }, + "user_tiktok":{ + "db_name": "UserTikTokData.db", + "title_line": user_tiktok_name, + "title_type": user_tiktok_type, + "field_keys": user_tiktok_keys, + }, + "mix": { + "db_name": "MixData.db", + "title_line": detail_name, + "title_type": detail_type, + "field_keys": detail_keys, + }, + "search_general": { + "db_name": "SearchData.db", + "title_line": detail_name, + "title_type": detail_type, + "field_keys": detail_keys, + }, + "search_user": { + "db_name": "SearchData.db", + "title_line": search_user_name, + "title_type": search_user_type, + "field_keys": search_user_keys, + }, + "search_live": { + "db_name": "SearchData.db", + "title_line": search_live_name, + "title_type": search_live_type, + "field_keys": search_live_keys, + }, + "hot": { + "db_name": "BoardData.db", + "title_line": hot_name, + "title_type": hot_type, + 
"field_keys": hot_keys, + }, + } + DataLogger = { + "csv": CSVLogger, + "xlsx": XLSXLogger, + "sql": SQLLogger, + # "mysql": BaseTextLogger, + } + + def run( + self, + parameter: "Parameter", + folder="", + type_="detail", + blank=False, + ): + root = parameter.root.joinpath( + parameter.CLEANER.filter_name(folder, "Data")) + root.mkdir(exist_ok=True) + params = self.LoggerParams[type_] + logger = BaseTextLogger if blank else self.DataLogger.get( + parameter.storage_format, BaseTextLogger) + return root, params, logger