Skip to content

Commit

Permalink
formatting
Browse files (browse the repository at this point in the history)
  • Loading branch information
MisterXY89 committed Dec 8, 2023
1 parent 33bcc72 commit 3109d73
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions chat_doc/dataset_generation/diagnose_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
from tqdm import tqdm

from chat_doc.config import BASE_DIR, logger, SEED
from chat_doc.config import BASE_DIR, SEED, logger
from chat_doc.dataset_generation.chat_dataset import ChatDataset

np.random.seed(SEED)
Expand Down Expand Up @@ -43,10 +43,11 @@ def process_data(self):

for col in diagnose_data.columns:
# remove all urls from text
diagnose_data[col] = diagnose_data[col].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()
diagnose_data[col] = (
diagnose_data[col].str.replace(r"\s*https?://\S+(\s+|$)", " ").str.strip()
)
# remove all html tags from text
diagnose_data[col] = diagnose_data[col].str.replace(r'<[^<]+?>', ' ').str.strip()

diagnose_data[col] = diagnose_data[col].str.replace(r"<[^<]+?>", " ").str.strip()

logger.info("Diagnose-Me data processed.")
self.processed = True
Expand All @@ -60,8 +61,7 @@ def build_prompts(self):
diagnose_data = diagnose_data.reset_index(drop=True)

prompts = []
for _, row in tqdm(diagnose_data.iterrows(), total=diagnose_data.shape[0]):

for _, row in tqdm(diagnose_data.iterrows(), total=diagnose_data.shape[0]):
prompts.append(
# inherit from ChatDataset
self.unify_prompt(
Expand Down

0 comments on commit 3109d73

Please sign in to comment.