Skip to content

Commit

Permalink
Merge pull request #11 from dario-github/dario-github/issue9
Browse files Browse the repository at this point in the history
  • Loading branch information
dario-github authored Feb 21, 2023
2 parents 1db737c + 94a3376 commit 0c0052a
Show file tree
Hide file tree
Showing 10 changed files with 276 additions and 64 deletions.
8 changes: 7 additions & 1 deletion configs/config.test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,10 @@ tasks :
filter :
property : Tags
multi_select :
contains : unit_testing
contains : unit_testing
-
run : False
name : discarded_task # Custom name for differentiation of output file
describe : discarded unit testing task # Description of the current task, used to record what the task is to do
database_id : '119943c69f5d477dacb124a093e202ce' # database id
extra : {}
57 changes: 55 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[tool.poetry]
name = "notion-nlp"
version = "1.0.2"
version = "1.0.3"
description = "Reading rich text information from a Notion database and performing simple NLP analysis."
authors = ["dario-github <[email protected]>"]
authors = ["Dario Zhang <[email protected]>"]
license = "MIT license"
readme = "README.md"
packages = [{include = "notion_nlp", from = "src"}]
Expand All @@ -20,6 +20,7 @@ tabulate = "0.9.0"
wcwidth = "0.2.6"
requests = "2.28.2"
typer = "0.7.0"
pydantic = "1.10.5"

[tool.poetry.group.dev.dependencies]
py = "1.11.0"
Expand Down
2 changes: 1 addition & 1 deletion src/notion_nlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.0.2"
__version__ = "1.0.3"
from . import core, parameter
from .__main__ import run_all_task, run_task, task_info
from .parameter import log
11 changes: 5 additions & 6 deletions src/notion_nlp/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def run_task(
download_stopwords: bool = False,
stopfiles_dir: str = (PROJECT_ROOT_DIR / "resources/stopwords").as_posix(),
stopfiles_postfix: str = "stopwords.txt",
top_n: int = 10,
top_n: int = 5,
output_dir: str = (PROJECT_ROOT_DIR / "results").as_posix(),
):
"""运行单个任务,任务字典或任务名必须传入一个
Expand All @@ -66,13 +66,15 @@ def run_task(
download_stopwords (bool, optional): 是否下载停用词. Defaults to False.
stopfiles_dir (str, optional): 停用词文件目录. Defaults to "notion_nlp/stopwords".
stopfiles_postfix (str, optional): 停用词文件后缀. Defaults to "stopwords.txt".
top_n (int, optional): 返回top_n的结果. Defaults to 10.
top_n (int, optional): 返回top_n的结果. Defaults to 5.
output_dir (str, optional): 输出目录. Defaults to "notion_nlp/results".
Raises:
ConfigError: 检查任务信息字典和任务名是否存在
TaskError: 检查任务是否存在或禁用
"""
if top_n < 1:
raise ValueError("top_n must be a positive integer")
# 以下的操作都是为了获取两个参数:notion_header和task参数类
# 如果config文件存在,可以不用提供token,只需task_name或task其中之一即可
if Path(config_file).exists():
Expand Down Expand Up @@ -161,10 +163,7 @@ def run_all_task(
for task in config.tasks:
if not task.run:
continue
run_task(
task=task,
token=config.notion.token,
)
run_task(task=task, token=config.notion.token)


if __name__ == "__main__":
Expand Down
3 changes: 2 additions & 1 deletion src/notion_nlp/core/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import ssl
from typing import List

import arrow
Expand Down Expand Up @@ -87,7 +88,7 @@ def read_blocks(self, pages: List):
url=f"https://api.notion.com/v1/blocks/{page_id}/children",
headers=self.header,
)
except Exception as e:
except ssl.SSLEOFError as e:
logging.error(
f"read blocks failed, page id: {page_id}, origin error info: {e}"
)
Expand Down
37 changes: 18 additions & 19 deletions src/notion_nlp/core/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,8 @@ def handling_sentences(self, stopwords: set, split_pkg: str):
word for word in sent if not self.check_stopwords(word, stopwords)
]
)

# 检查序列是否为空
if not self.sequence:
if not any(self.sequence):
logging.error(
f"该任务未获取到符合条件的文本,请检查停用词。database ID: {self.database_id}; extra data: {self.extra_data}"
)
Expand Down Expand Up @@ -324,25 +323,25 @@ def by_sum(df: pd.DataFrame):
return df.sum(axis=0).sort_values(ascending=False)


def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word for a single document.

    Args:
        wordDict: Mapping of word -> occurrence count within the document.
        bagOfWords: The document's tokens. Only its length is used, as the
            normalising denominator.

    Returns:
        dict: word -> count / len(bagOfWords). When ``bagOfWords`` is empty,
        every word maps to 0.0 instead of raising ``ZeroDivisionError``.
    """
    total = len(bagOfWords)
    # An empty document has no terms, so every frequency is zero by definition.
    if total == 0:
        return {word: 0.0 for word in wordDict}
    return {word: count / float(total) for word, count in wordDict.items()}
# def computeTF(wordDict, bagOfWords):
# tfDict = {}
# bagOfWordsCount = len(bagOfWords)
# for word, count in wordDict.items():
# tfDict[word] = count / float(bagOfWordsCount)
# return tfDict


def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: Sequence of mappings, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        dict: word -> log(N / df), where N is the number of documents and df
        is the number of documents in which the word's count is positive.
        Words that occur in no document (df == 0) map to 0.0 rather than
        raising ``ZeroDivisionError``. An empty ``documents`` yields ``{}``.
    """
    import math

    doc_total = len(documents)

    # Collect the vocabulary from every document (not only the first one) so
    # that a word missing from documents[0] no longer raises KeyError.
    doc_freq = {}
    for document in documents:
        for word, val in document.items():
            doc_freq.setdefault(word, 0)
            if val > 0:
                doc_freq[word] += 1

    # df == 0 would divide by zero; such a word carries no weight, so use 0.0.
    return {
        word: math.log(doc_total / float(freq)) if freq else 0.0
        for word, freq in doc_freq.items()
    }
2 changes: 1 addition & 1 deletion src/notion_nlp/parameter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def to_dict(self):
}


class RequestParams:
class ConfigParams:
def __init__(self, token, tasks: List[TaskParams]):
self.notion: NotionParams = NotionParams(token)
self.tasks: List[TaskParams] = self.process_task_name(tasks)
Expand Down
8 changes: 4 additions & 4 deletions src/notion_nlp/parameter/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from typing import Iterator

from notion_nlp.parameter.config import RequestParams, TaskParams
from notion_nlp.parameter.config import ConfigParams, TaskParams


def load_stopwords(stopfiles: Iterator):
Expand Down Expand Up @@ -33,14 +33,14 @@ def load_stopwords(stopfiles: Iterator):
return stopwords


def load_config(config_file: str = "configs/config.yaml") -> RequestParams:
def load_config(config_file: str = "configs/config.yaml") -> ConfigParams:
"""从配置文件加载参数类
Args:
config_file (str, optional): 参数文件地址. Defaults to "configs/config.yaml".
Returns:
RequestParams: 包含所有用于request信息的参数类
ConfigParams: 包含所有用于request信息的参数类
"""
from ruamel.yaml import YAML

Expand All @@ -58,7 +58,7 @@ def join(loader, node):
with open(config_file, "r", encoding="utf-8") as f:
config = yaml.load(f)
tasks = [TaskParams(**task) for task in config["tasks"]]
config = RequestParams(config["notion"]["token"], tasks)
config = ConfigParams(config["notion"]["token"], tasks)
return config


Expand Down
Loading

0 comments on commit 0c0052a

Please sign in to comment.