
Commit pre-gnews-pivot
dcolinmorgan committed Mar 20, 2024
1 parent c2c37ef commit 3dd305c
Showing 8 changed files with 48,467 additions and 49,543 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/on_push.yml
@@ -38,10 +38,10 @@ jobs:
env:
OS_TOKEN: ${{ secrets.OS_TOKEN }}

- name: Minimal tests 2
run: |
source DOTS/bin/activate
python -m spacy download en_core_web_sm
python DOTS/dots_feat.py -n 5 -f 3 -s 1 -o dots_feats.csv
env:
OS_TOKEN: ${{ secrets.OS_TOKEN }}
# - name: Minimal tests 2
# run: |
# source DOTS/bin/activate
# python -m spacy download en_core_web_sm
# python DOTS/dots_feat.py -n 5 -f 3 -s 1 -o dots_feats.csv
# env:
# OS_TOKEN: ${{ secrets.OS_TOKEN }}
20,017 changes: 85 additions & 19,932 deletions DOTS/dots_feat.ipynb

Large diffs are not rendered by default.

58 changes: 46 additions & 12 deletions DOTS/dots_feat.py
@@ -25,7 +25,7 @@

# Setup argument parser
parser = argparse.ArgumentParser(description='Process OS data for dynamic features.')
parser.add_argument('-n', type=int, default=10000, help='Number of data items to get')
parser.add_argument('-n', type=int, default=1000, help='Number of data items to get')
parser.add_argument('-f', type=int, default=3, help='Number of features per item to get')
parser.add_argument('-o', type=str, default='dots_feats.csv', help='Output file name')
parser.add_argument('-p', type=int, default=1, help='Parallelize requests')
@@ -78,6 +78,10 @@ def get_massive_data(n=args.n):
    query = {
        "size": str(n),
        "timeout": "10s",
        "slice": {
            "id": 0,
            "max": 10
        },
        "query": {
            "bool": {
                "must": [
@@ -90,12 +94,8 @@
        scroll='1m',
        body=query,
    )
    pagination_id = response["_scroll_id"]
    response = client.scroll(
        scroll='1m',
        scroll_id=pagination_id
    )
    return response

    return response, client


def process_hit(hit):
@@ -175,6 +175,34 @@ def process_data(hits,fast=args.p):
    return articles


def process_response(response):
    hits = response["hits"]["hits"]
    output=[]
    for hit in hits:
        source = hit["_source"]
        date = datetime.strptime(source['metadata']['GDELT_DATE'], "%Y%m%d%H%M%S")
        date = formatted_date = date.strftime("%d-%m-%Y")
        loc = source['metadata']['Locations']
        loc = loc.replace("'", '"') # json requires double quotes for keys and string values
        try:
            list_of_dicts = json.loads(loc)
            location_full_names = [dict['Location FullName'] for dict in list_of_dicts if 'Location FullName' in dict]
            loc = location_full_names[0]
        except:
            loc = None
        org = source['metadata']['Organizations']
        per = source['metadata']['Persons']
        theme = source['metadata']['Themes'].rsplit('_')[-1]
        title = source['metadata']['page_title']
        url = source['metadata']['DocumentIdentifier']
        output.append([date, loc, title, org, per, theme, url])

    pagination_id=response['_scroll_id']
    return pagination_id, output


def chunk_text(text, max_len):
    tokens = nltk.word_tokenize(text)
    num_chunks = len(tokens) // max_len
@@ -233,17 +261,23 @@ def main(args):
        data = get_data(args.n)
        articles = process_data(data)
    else:
        response = get_massive_data(args.n)
        response, client = get_massive_data(args.n)
        pagination_id = response["_scroll_id"]
        hits = response["hits"]["hits"]
        articles=[]
        while len(hits) != 0:
            articles=[]
            client = OpenSearch(os_url)
            # try:
            response = client.scroll(
                scroll='1m',
                scroll='5m',
                scroll_id=pagination_id
            )
            articles.append(process_data(response))
            hits = response["hits"]["hits"]
            pagination_id, article = process_response(response)
            articles.append(article)
            # except:
            #     print("A ConnectionTimeout error occurred.")
            #     pass
        articles = [item for sublist in articles for item in sublist]
    rank_articles=[]
    for i in tqdm(articles, desc="featurizing articles"):
        foreparts=str(i).split(',')[:2] # location and date
39,290 changes: 39,290 additions & 0 deletions DOTS/input/big_report.csv

Large diffs are not rendered by default.

38,610 changes: 9,018 additions & 29,592 deletions DOTS/input/report.csv

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions DOTS/test/test_dots_feat.py
@@ -18,3 +18,21 @@ def test_featurize_stories(data=get_data(5)):
        assert len(features) == 4
    except:
        pass

def test_massive_featurize(data=get_massive_data()):
    pagination_id = response["_scroll_id"]
    hits = response["hits"]["hits"]
    while len(hits) != 0:
        articles=[]
        client = OpenSearch(os_url)
        response = client.scroll(
            scroll='1m',
            scroll_id=pagination_id
        )
        articles.append(process_data(response))
    # assert len(articles) == 5
    try:  # since some stories will be unretrievable
        features = featurize_stories(str(articles), 4, 512)
        assert len(features) == 4
    except:
        pass
2 changes: 2 additions & 0 deletions README.md
@@ -1,6 +1,8 @@
# OpenSearch News Featurizer

Using the OpenSearch API, this tool pulls news stories and extracts features from the text. The features are then stored in a CSV file.

Using scroll and slice, the tool can now pull a large number of stories from OpenSearch.
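
For illustration, here is a minimal sketch of driving a sliced scroll with opensearch-py; the endpoint, index name, and `match_all` query below are placeholders, not the values used by `dots_feat.py`:

```python
# Hypothetical sliced-scroll pull; os_url and the index name are placeholders.
from opensearchpy import OpenSearch

os_url = "https://localhost:9200"   # placeholder endpoint
client = OpenSearch(os_url)

def scroll_slice(slice_id, max_slices=10, page_size=1000, index="news"):
    """Yield every hit belonging to one slice of the index via the scroll API."""
    body = {
        "size": page_size,
        "slice": {"id": slice_id, "max": max_slices},  # partition the scroll into independent slices
        "query": {"match_all": {}},                    # placeholder query
    }
    response = client.search(index=index, body=body, scroll="5m")
    scroll_id = response["_scroll_id"]
    hits = response["hits"]["hits"]
    while hits:
        yield from hits
        response = client.scroll(scroll_id=scroll_id, scroll="5m")
        scroll_id = response["_scroll_id"]
        hits = response["hits"]["hits"]
    client.clear_scroll(scroll_id=scroll_id)  # release the server-side scroll context

# Each slice (0 .. max_slices-1) can be consumed by a separate worker.
for hit in scroll_slice(slice_id=0):
    pass  # e.g. hand off to process_hit / featurize_stories
```

Each slice id covers a disjoint subset of the index, so running one scroll per slice in separate processes parallelizes the pull without returning duplicate documents.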

Clone current version & run [dots_feat.py](https://github.com/dcolinmorgan/DOTS/blob/main/DOTS/dots_feat.py)
--------------------------------------------------
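
For a quick start, the invocation below mirrors the minimal-test command from the CI workflow in this commit; the flag values are illustrative, and per the argument parser `-n` sets how many items to pull, `-f` the number of features per item, and `-o` the output CSV:

```bash
source DOTS/bin/activate
python -m spacy download en_core_web_sm
python DOTS/dots_feat.py -n 5 -f 3 -s 1 -o dots_feats.csv
```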
1 change: 1 addition & 0 deletions setup.py
@@ -10,6 +10,7 @@
        'scikit-learn',
        'transformers',
        'torch',
        'opensearchpy',
        'requests',
        'nltk',
        'numpy',
