restructure folders
dcolinmorgan committed Mar 28, 2024
1 parent 5947d39 commit ab15c39
Showing 4 changed files with 203 additions and 93 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/daily.yml
@@ -47,7 +47,8 @@ jobs:
        run: |
          source dots/bin/activate
          python -m spacy download en_core_web_sm
          # python DOTS/feat.py -d 3
          python -m main -d 3
        env:
          OS_TOKEN: ${{ secrets.OS_TOKEN }}
          LOBSTR_KEY: ${{ secrets.LOBSTR_KEY }}
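
Note on the workflow change above: python -m expects a dotted module name, so the old invocation "python -m DOTS/feat.py -d 3" mixed a file path into module syntax. The commit comments that line out and calls "python -m main -d 3" instead, which runs the new top-level main.py added later in this commit.
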
185 changes: 93 additions & 92 deletions DOTS/feat.py
@@ -1,51 +1,52 @@
# conda create -n DT ipykernel
# python -m ipykernel install --user --name DT
# pip install torch bs4 transformers spacy numpy pandas scikit-learn scipy nltk
import argparse, signal
from tqdm import tqdm
from datetime import datetime
import numpy as np, pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoModel, AutoTokenizer
import torch, spacy,nltk,subprocess, json, requests,string,csv,logging,os
# import argparse
# from tqdm import tqdm
# from datetime import datetime
# import numpy as np, pandas as pd
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import CountVectorizer
# from transformers import AutoModel, AutoTokenizer
# import torch, spacy,nltk,subprocess, json, requests,string,csv,logging,os

from .scrape import get_OS_data, get_massive_OS_data, get_google_news, scrape_lobstr # need .scrape and .pull for production
from .pull import process_hit, process_data, pull_data, pull_lobstr_gdoc
# from .scrape import get_OS_data, get_massive_OS_data, get_google_news, scrape_lobstr # need .scrape and .pull for production
# from .pull import process_hit, process_data, pull_data, pull_lobstr_gdoc

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
# try:
# nltk.data.find('tokenizers/punkt')
# except LookupError:
# nltk.download('punkt')


# Setup logging
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
# logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

# Setup argument parser
parser = argparse.ArgumentParser(description='Process OS data for dynamic features.')
# parser.add_argument('-n', type=int, default=100, help='Number of data items to get')
parser.add_argument('-f', type=int, default=3, help='Number of features per item to get')
parser.add_argument('-o', type=str, default='dots_feats.csv', help='Output file name')
# parser.add_argument('-p', type=int, default=1, help='Parallelize requests')
# parser.add_argument('-t', type=int, default=1, help='Scroll Timeout in minutes, if using "d=1" large data set')
parser.add_argument('-d', type=int, default=1, help='0 for a small amount, 1 for large, 2 for google news, 3 for lobstr')
# parser.add_argument('-e', type=datetime, default=20231231, help='end date')
args, unknown = parser.parse_known_args()
# # Setup argument parser
# parser = argparse.ArgumentParser(description='Process OS data for dynamic features.')
# # parser.add_argument('-n', type=int, default=100, help='Number of data items to get')
# parser.add_argument('-f', type=int, default=3, help='Number of features per item to get')
# parser.add_argument('-o', type=str, default='dots_feats.csv', help='Output file name')
# # parser.add_argument('-p', type=int, default=1, help='Parallelize requests')
# # parser.add_argument('-t', type=int, default=1, help='Scroll Timeout in minutes, if using "d=1" large data set')
# parser.add_argument('-d', type=int, default=1, help='0 for a small amount, 1 for large, 2 for google news, 3 for lobstr')
# # parser.add_argument('-e', type=datetime, default=20231231, help='end date')
# args, unknown = parser.parse_known_args()

# Load models and tokenizers
model_name = "distilroberta-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# !python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# Define constants
n_gram_range = (1, 2)
stop_words = "english"
embeddings=[]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# # Load models and tokenizers
# model_name = "distilroberta-base"
# model = AutoModel.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# # !python -m spacy download en_core_web_sm
# nlp = spacy.load('en_core_web_sm')

# # Define constants
# n_gram_range = (1, 2)
# stop_words = "english"
# embeddings=[]
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Define functions
def chunk_text(text, max_len):
@@ -106,60 +107,60 @@ def featurize_stories(text, top_k, max_len):
    return [candidates[index] for index in distances.argsort()[0][::-1][-top_k:]]


# Main pipeline
def main(args):
    if args.d == 0:
        data = get_OS_data(args.n)
        articles = process_data(data)
        # articles = process_response(data)
        dname = 'small0_'
    elif args.d == 1:
        response, client = get_massive_OS_data(args.t)
        pagination_id = response["_scroll_id"]
        hits = response["hits"]["hits"]
        articles = []
        while len(hits) != 0 and len(articles2) < args.n:
            response = client.scroll(
                scroll=str(args.t)+'m',
                scroll_id=pagination_id
            )
            hits = response["hits"]["hits"]
            # article = process_data(response)
            articles.append(hits)
            articles2 = [item for sublist in articles for item in sublist]
        articles = [item for sublist in articles for item in sublist]
        dname = 'large1_'
    elif args.d == 2:
        articles = get_google_news('disaster')
        dname = 'google2_'
    elif args.d == 3:
        articles = pull_lobstr_gdoc()
        dname = 'lobstr3_'
    rank_articles = []
    if device == 'cuda':
        dataloader = DataLoader(data['text'], batch_size=1, shuffle=True, num_workers=4)
        RR = dataloader
    else:
        RR = articles
    for j,i in tqdm(enumerate(RR), total=len(RR), desc="featurizing articles"):
    # for i in tqdm(articles, desc="featurizing articles"):
        try:
            foreparts = str(i).split(',')[:2] # location and date
        except:
            foreparts=None
        # meat="".join(str(j).split(',')[2:-3]) # text
        try:
            cc=featurize_stories(str(i), top_k = args.f, max_len=512)
            rank_articles.append([foreparts,cc])
            with open('DOTS/output/'+dname+args.o, 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerows([cc])
        except Exception as e:
            logging.error(f"Failed to process article: {e}")
# # Main pipeline
# def main(args):
# if args.d == 0:
# data = get_OS_data(args.n)
# articles = process_data(data)
# # articles = process_response(data)
# dname = 'small0_'
# elif args.d == 1:
# response, client = get_massive_OS_data(args.t)
# pagination_id = response["_scroll_id"]
# hits = response["hits"]["hits"]
# articles = []
# while len(hits) != 0 and len(articles2) < args.n:
# response = client.scroll(
# scroll=str(args.t)+'m',
# scroll_id=pagination_id
# )
# hits = response["hits"]["hits"]
# # article = process_data(response)
# articles.append(hits)
# articles2 = [item for sublist in articles for item in sublist]
# articles = [item for sublist in articles for item in sublist]
# dname = 'large1_'
# elif args.d == 2:
# articles = get_google_news('disaster')
# dname = 'google2_'
# elif args.d == 3:
# articles = pull_lobstr_gdoc()
# dname = 'lobstr3_'
# rank_articles = []
# if device == 'cuda':
# dataloader = DataLoader(data['text'], batch_size=1, shuffle=True, num_workers=4)
# RR = dataloader
# else:
# RR = articles
# for j,i in tqdm(enumerate(RR), total=len(RR), desc="featurizing articles"):
# # for i in tqdm(articles, desc="featurizing articles"):
# try:
# foreparts = str(i).split(',')[:2] # location and date
# except:
# foreparts=None
# # meat="".join(str(j).split(',')[2:-3]) # text
# try:
# cc=featurize_stories(str(i), top_k = args.f, max_len=512)
# rank_articles.append([foreparts,cc])
# with open('DOTS/output/'+dname+args.o, 'a', newline='') as file:
# writer = csv.writer(file)
# writer.writerows([cc])
# except Exception as e:
# logging.error(f"Failed to process article: {e}")

    with open('DOTS/output/full_'+dname+args.o, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(rank_articles)
# with open('DOTS/output/full_'+dname+args.o, 'a', newline='') as file:
# writer = csv.writer(file)
# writer.writerows(rank_articles)

if __name__ == "__main__":
    main(args)
# if __name__ == "__main__":
# main(args)
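
For context on the ranking step: the body of featurize_stories is collapsed in the hunk above, and only its final return line is visible. The sketch below is a hypothetical rank_candidates helper illustrating the kind of pipeline that return line implies, built only from objects already defined in this file (CountVectorizer candidates over n_gram_range, embeddings from the distilroberta-base model, cosine_similarity ranking, and the same final indexing). The mean-pooling of the last hidden state and the CPU placement are assumptions, not confirmed by the diff.

import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

model_name = "distilroberta-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def rank_candidates(text, top_k, n_gram_range=(1, 2), stop_words="english"):
    # Hypothetical helper, not the committed implementation.
    # Candidate phrases are 1- and 2-grams drawn from the document itself.
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
    candidates = count.get_feature_names_out()

    def embed(texts):
        # Assumption: mean-pooled last hidden states serve as embeddings (CPU).
        inputs = tokenizer(list(texts), padding=True, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            return model(**inputs).last_hidden_state.mean(dim=1).numpy()

    distances = cosine_similarity(embed([text]), embed(candidates))
    # Same indexing as the visible return statement in the diff above.
    return [candidates[i] for i in distances.argsort()[0][::-1][-top_k:]]
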
Empty file added __init__.py
Empty file.
108 changes: 108 additions & 0 deletions main.py
@@ -0,0 +1,108 @@
import argparse
from tqdm import tqdm
from datetime import datetime
import numpy as np, pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoModel, AutoTokenizer
import torch, spacy,nltk,subprocess, json, requests,string,csv,logging,os

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


from DOTS.feat import chunk_text, featurize_stories
from DOTS.scrape import get_OS_data, get_google_news, get_massive_OS_data, get_npr_news, scrape_lobstr
from DOTS.pull import process_hit, process_data, pull_data, process_response, pull_lobstr_gdoc

logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')


def _input():
    # Setup argument parser
    parser = argparse.ArgumentParser(description='Process OS data for dynamic features.')
    # parser.add_argument('-n', type=int, default=100, help='Number of data items to get')
    parser.add_argument('-f', type=int, default=3, help='Number of features per item to get')
    parser.add_argument('-o', type=str, default='dots_feats.csv', help='Output file name')
    # parser.add_argument('-p', type=int, default=1, help='Parallelize requests')
    # parser.add_argument('-t', type=int, default=1, help='Scroll Timeout in minutes, if using "d=1" large data set')
    parser.add_argument('-d', type=int, default=3, help='0 for a small amount, 1 for large, 2 for google news, 3 for lobstr')
    # parser.add_argument('-e', type=datetime, default=20231231, help='end date')
    args, unknown = parser.parse_known_args()
    return args

# Load models and tokenizers
model_name = "distilroberta-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# !python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# Define constants
n_gram_range = (1, 2)
stop_words = "english"
embeddings=[]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

args = _input()

# Main pipeline
def main(args):
    if args.d == 0:
        data = get_OS_data(args.n)
        articles = process_data(data)
        # articles = process_response(data)
        dname = 'small0_'
    elif args.d == 1:
        response, client = get_massive_OS_data(args.t)
        pagination_id = response["_scroll_id"]
        hits = response["hits"]["hits"]
        articles = []
        while len(hits) != 0 and len(articles2) < args.n:
            response = client.scroll(
                scroll=str(args.t)+'m',
                scroll_id=pagination_id
            )
            hits = response["hits"]["hits"]
            # article = process_data(response)
            articles.append(hits)
            articles2 = [item for sublist in articles for item in sublist]
        articles = [item for sublist in articles for item in sublist]
        dname = 'large1_'
    elif args.d == 2:
        articles = get_google_news('disaster')
        dname = 'google2_'
    elif args.d == 3:
        articles = pull_lobstr_gdoc()
        dname = 'lobstr3_'
    rank_articles = []
    if device == 'cuda':
        dataloader = DataLoader(data['text'], batch_size=1, shuffle=True, num_workers=4)
        RR = dataloader
    else:
        RR = articles
    for j,i in tqdm(enumerate(RR), total=len(RR), desc="featurizing articles"):
    # for i in tqdm(articles, desc="featurizing articles"):
        try:
            foreparts = str(i).split(',')[:2] # location and date
        except:
            foreparts=None
        # meat="".join(str(j).split(',')[2:-3]) # text
        try:
            cc=featurize_stories(str(i), top_k = args.f, max_len=512)
            rank_articles.append([foreparts,cc])
            with open('DOTS/output/'+dname+args.o, 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerows([cc])
        except Exception as e:
            logging.error(f"Failed to process article: {e}")

    with open('DOTS/output/full_'+dname+args.o, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(rank_articles)

if __name__ == "__main__":
    main(args)
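
A hypothetical usage sketch for the new entry point: the Namespace fields mirror the parser defaults in _input(), and the output paths follow from the dname prefix and the -o default. Note that the -d 0 and -d 1 branches still reference args.n and args.t (whose parser options are commented out) and the cuda branch uses DataLoader without an import, so those paths would additionally need the arguments restored and something like from torch.utils.data import DataLoader.

# Hypothetical programmatic run of the -d 3 (lobstr) path; assumes the
# DOTS/output/ directory exists and that importing main is acceptable
# (it loads the model and spaCy pipeline at module level).
import argparse
from main import main

args = argparse.Namespace(d=3, f=3, o='dots_feats.csv')  # mirrors the _input() defaults
main(args)  # appends to DOTS/output/lobstr3_dots_feats.csv and DOTS/output/full_lobstr3_dots_feats.csv
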
