From 490bf9bb71e86c44f3c88fc286b8b15994a09f03 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 29 Mar 2024 17:06:14 +0700 Subject: [PATCH] dbscan clusters on story features --- .github/workflows/daily.yml | 2 +- DOTS/output/full_small0_dots_feats.csv | 80 +++++++++++ DOTS/output/small0_dots_feats.csv | 80 +++++++++++ DOTS/output/test_gnews_dots_feats.csv | 2 + DOTS/pull.py | 11 +- DOTS/scrape.py | 126 +++++------------ demo/dots_feat.ipynb | 180 ++++++++++++------------- main.py | 48 ++++--- 8 files changed, 303 insertions(+), 226 deletions(-) create mode 100644 DOTS/output/full_small0_dots_feats.csv create mode 100644 DOTS/output/test_gnews_dots_feats.csv diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 15f5f6c..281a087 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -47,7 +47,7 @@ jobs: run: | source dots/bin/activate python -m spacy download en_core_web_sm - python -m main -d 0 -n 100 -f 10 + python -m main -d 4 -n 200 -f 10 env: OS_TOKEN: ${{ secrets.OS_TOKEN }} LOBSTR_KEY: ${{ secrets.LOBSTR_KEY }} diff --git a/DOTS/output/full_small0_dots_feats.csv b/DOTS/output/full_small0_dots_feats.csv new file mode 100644 index 0000000..f593010 --- /dev/null +++ b/DOTS/output/full_small0_dots_feats.csv @@ -0,0 +1,80 @@ +"[""['Palestinians walk through destruction by the Israeli bombardment in the Nusseirat refugee camp in Gaza Strip"", ' Tuesday']","['famine', 'trailblazers', 'camps', 'bases', 'egypt', 'evacuation', 'cease', 'rebels', 'widespread famine', 'disease']" +"[""['Alessandro Tortora / Shutterstock.com'"", "" 'Italian archaeologists revealed a “monumental” Roman villa in early January""]","['waste', 'cataclysms', 'disasters', 'eruption', 'valorizzazione', 'mount vesuvius', 'ancient cataclysm', 'mariano nuzzo', 'costruzione', 'beach']" +"[""['\\nThe police in St Catherine have now identified the licensed firearm holder who was shot dead at a cock fight in bushes in Old Harbour on Sunday.'"", "" '\\n\\tHe is 56-year-old\\xa0Devon Gallimore of'""]","['district', 'discussion', 'firearm', 'clarendon', 'old harbour', 'sunday', 'new bowen', 'whatsapp', 'st catherine', 'fight']" +"[""['For premium support please call:'"", "" 'Snow shovels have been gathering dust since 2022 in many cities along the Interstate 95 corridor""]","['700 days', 'cancellations', 'travel delays', 'school cancellations', 'streak', 'snow shovels', 'meteorologists', 'snowstorm', 'accuweather alerts', 'accuweather meteorologists']" +"[""['NEW BEDFORD (WBSM) — New Bedford Police arrested a juvenile for carrying a firearm in the city’s South End over the weekend.'"", "" 'On Saturday""]","['vehicle', 'juvenile', 'detectives', 'wbsm', 'kate robinson', 'saturday', 'traffic', 'ariel dorsey', 'massachusetts law', 'conflicting statements']" +"[""['Fire crews have dealt with a blaze that has caused extensive damage to a house in North Clare this afternoon.'"", "" 'The alarm was raised at around 3.30pm when emergency services were alerted to fire at a domestic property at Newtown""]","['extensive damage', 'medical attention', 'breathing apparatus', 'emergency', 'damage', 'afternoon', 'breathing', 'rescue service', 'emergency services', 'firefighters']" +"[""['A preliminary 3.4 magnitude earthquake struck near Gilroy Tuesday morning"", "" according to the United States Geological Survey.'""]","['checklists', 'northeast', 'miles', 'magnitude', 'morning', 'preparedness', 'watsonville', 'earthquake', 'disaster', 'quake']" +"[""['Millions of Syrians face hunger with the suspension of all in-kind WFP food aid this month"", ' in part due to major cuts to US funding. US aid cuts of up to 50 percent are expected across all humanitarian sectors in 2024']","['saleem algerk', 'bread', 'early recovery', 'camps', 'pumping', 'cholera cases', 'neighboring countries', 'symptom', 'cholera', 'mohamad katoub']" +"[""['Lava from a volcanic eruption in southwestern Iceland has streamed into a nearby town"", "" engulfing homes and forcing the evacuation of local residents.\\n'""]","['volcano', 'grindavík', 'volcanoes', 'lava flows', 'eruption', 'earthquakes', 'evacuation', 'jóhannesson', 'eruptions', 'lúðvík pétursson']" +"[""['Get local news delivered to your inbox!'"", ' \'Lin Guyton says these photos shows how much better the horses are getting at Epona Horse Rescue.\\xa0Guyton learned ""out of the blue"" that 37 horses are going to be sold at auction on Saturday. Two others were leased and not part of the auction.\'']","['glaesemann', '30th street', '39 horses', 'horses', 'farnam street', 'jose galeno', 'federal courthouse', 'winter storms', 'confiscated horses', 'heavy snow']" +"[""['\\n This material may not be published"", ' broadcast']","['reward', 'detective', 'victim', 'washington', 'decision', 'perpetrator', 'conviction', 'michael dorgan', 'minutes', 'southern border']" +"[""['On the night of Thursday 11-Friday 12 January the US and UK militaries bombed sites in Yemen controlled by the Houthi movement. The stated aim of the action"", ' according to a UK spokesperson']","['shiite muslims', 'helicopter', 'revenge', 'israeli jews', 'women workers', 'bases', 'struggle', 'civilian ships', 'cease', 'ceasefire']" +"[""['(Catch all the US News"", ' UK News']","['stabbings', 'collapse', 'hard work', 'garcetti', 'volatile times', 'buy voltas', 'diktat', 'sonam wangchuk', 'poonawalla', 'underperformer']" +"[""['PORTLAND"", ' Ore. (KPTV/Gray News) - A Gresham']","['emotions', 'rain returns', 'couple', 'adrenaline', 'kitchen', 'swaying', 'neighbors', 'cason wolcott', 'mixed emotions', 'philip wolcott']" +"[""['An appropriate representation of the requested resource could not be found on this server. This error was generated by Mod_Security.']""]","['resource', 'representation', 'server', 'mod_security', 'error']" +"[""['Plentiful sunshine. High 47F. Winds NW at 10 to 15 mph..\\r\\n '"", "" 'Partly cloudy. Low 33F. Winds light and variable.'""]","['shifa hospital', 'winds nw', 'khan yunis', 'medwish', 'weeks', 'mph', 'israeli soldiers', 'ahmed kahlot', 'nominations', 'israeli interrogators']" +"[""['A man in the town of Naples was shot by police after an armed confrontation.'"", ' ""The incident began after the Cumberland County Sheriff\'s Office received a phone call that was abruptly cut off when the caller hung up. The dispatcher called back and spoke with Eric Sweda']","['injuries', 'comments', 'eric sweda', 'sweda', 'standard procedure', 'confrontation', 'dispatcher', 'dispatchers', 'meagan drillinger', 'rachel cavanaugh']" +"[""['Topics you care about"", "" straight to your inbox'""]","['inbox', 'llc']" +"[""['Share'"", "" 'ORLANDO""]","['material', 'occupants', 'block', 'day', 'cause', 'home', 'news', 'vehicle', 'casasia drive', 'orlando']" +"[""['Tahoe Fire and Fuels Team members including North Lake Tahoe"", ' North Tahoe and Tahoe Douglas fire protection districts']","['surveillance', 'temperatures', 'wildfires', 'laffy taffy', 'surrounding counties', 'cooler temperatures', 'unwanted wildfires', 'gusty winds', 'disease', 'golden eagles']" +"[""['FREDERICKTOWN"", ' Mo. (KFVS) - Crews battled a fire at a home in Fredericktown on Monday evening']","['scene', 'home', 'evening', 'water', 'injuries', 'fredericktown', 'monday evening', 'highway oo', 'firefighters', 'temperatures']" +"[""['CROGHAN"", ' New York (WWNY) - A 64 year old Croghan man died in a fire in his garage Monday']","['investigation', 'lyndaker', 'structure', 'rights', 'cause', 'department', 'flames', 'home', 'deputies', 'firefighters']" +"['[""\\n\\t\\t\\tThe content you\'re looking for is no longer available.\\n\\t\\t""', "" 'Hearst Television participates in various affiliate marketing programs""]","['programs', 'content', 'commissions', 'affiliate', 'retailer', 'products', 'links', 'retailer sites', 'marketing', 'hearst television']" +"[""['\\n This is the second volcanic eruption near Grindavík in one month"", "" as the Svartsengi volcanic system has re-awakened.\\n'""]","['fishing', 'eruptive activity', 'atmosphere', 'swarm', 'eruption', 'earthquakes', 'machines', 'enormous clouds', 'keflavík airport', 'sundhnúksgígar']" +"[""['Louth Live'"", "" 'https://www.dundalkdemocrat.ie/section/1234/advertise-with-us'""]","['temperatures', 'frost', 'drizzle', 'west winds', 'cold weather', 'highest temperatures', 'lowest temperatures', 'widespread frost', 'freezing fog', 'west breezes']" +"[""['The latest breaking updates"", "" delivered straight to your email inbox.'""]","['sister station', 'monday', 'kcra', 'california', 'sister', 'metal', 'overhang', 'damage', 'hearst television', 'pumps']" +"[""[' Sign In'"", "" ' Subscribe Now'""]","['snowfall', 'preparedness', 'vancouver island', 'afternoon', 'greater victoria', 'winter conditions', 'extreme weather', 'freezing rain', 'heavy snow', 'widespread snow']" +"[""['\\nKOKA The Heart of Gospel is a part of Alpha Media LLC.© 2024 Alpha Media LLC. All Rights Reserved. ']""]",['gospel'] +"[""['\\nLBC\\n'"", "" '\\n\\n \\n \\n Matthew Wright\\n \\n \\n \\n7am - 10am\\n'""]","['famine', 'heavy fighting', 'matthew wright', 'bases', 'rebels', 'israeli airstrikes', 'evacuation', 'baghdad', 'widespread famine', 'disease']" +"[""['A House vote planned for Tuesday night on a stopgap spending bill aimed at keeping the federal government funded until March is on a snow delay until Wednesday"", "" narrowing the time needed to pass before a potential partial government shutdown that could start this weekend.\\xa0'""]","['opportunity', 'wednesday', 'newsmax', 'weather', 'votes', 'march', 'night', 'tuesday night', 'vote', 'shutdown']" +"['[\'A bomb placed outside a home exploded. The Hudson Valley town was recently named one of New York\\\'s 25 ""safest"" places to live.\'', "" 'Police responded to a home in the Lower Hudson Valley following reports of an explosion.'""]","['hometowns', 'new york', 'accomplice', 'damage', 'morning', 'stanivukovic', 'mclain street', 'federal charges', 'explosion', 'dispute']" +"[""['This website is using a security service to protect itself from online attacks. The action you just performed triggered the security solution. There are several actions that could trigger this block including submitting a certain word or phrase"", "" a SQL command or malformed data.'""]","['block', 'command', 'solution', 'page', 'phrase', 'word', 'ray id', 'attacks', 'malformed data', 'online attacks']" +"[""['Offaly Live'"", "" 'https://www.offalyexpress.ie/section/1236/advertise-with-us'""]","['search', 'whitelist', 'rescue service', 'secondary schools', 'rescue services', 'firefighters', 'dublin castle', 'afternoon', 'near miss', 'evacuation']" +"[""['This page either does not exist or is currently unavailable.'"", ' \'From here you can either hit the ""back"" button on your browser to return to the previous page']","['site', 'button', 'page', 'browser']" +"[""['Next up in 5'"", "" 'Example video title will go here for this video'""]","['texas', 'calls', 'witnesses', 'debris', 'restaurant', 'conversation', 'fatalities', 'floor', 'atmos energy', 'explosion']" +"[""['Mostly Cloudy'"", "" '44'""]","['flooding', 'flood advisory', 'mph', '40 mph', '35 mph', 'daytime highs', 'gusty winds', 'heavy snow', 'widespread snow', 'blowing snow']" +"[""['Credit Cards'"", "" 'Loans'""]","['alaska airlines', 'cancellations', 'differences', 'flight disruption', 'tax brackets', 'wind chills', 'reagan airport', 'degrees', 'winter weather', 'severe weather']" +"[""['Partly Cloudy'"", "" '49'""]","['fri', 'redding', 'thu', 'vehicle', 'interactive radar', 'ashley gardner', 'completion', 'thursday', 'friday', 'firefighters']" +"[""[''"", "" ''""]","['parks', 'opportunities', 'envisionwise technology', 'destination', 'trees', 'sledding', 'vincent moore', 'windstorm', 'gates', 'willamette week']" +"[""[' Sign In'"", "" ' Subscribe Now'""]","['boats', 'blaze', 'coast guard', 'large flames', 'leathead road', 'visible flame', 'emergency', 'firefighters', 'emergency crews', 'okanagan lake']" +"[""['Share'"", ' ""Delays and cancelations are creating chaos for travelers going through Dulles International and Reagan National airports. The airports are dealing with their first significant storm in years and that’s meant a lot of headaches for passengers both Monday and Tuesday. FOX 5\'s Tom Fitzgerald has all the details.""']","['half', 'headaches', 'headache', 'chaos', 'dulles international', 'bare pavement', 'victims', 'husbands', 'search', 'reagan national']" +"['[\'\\n\\n On Air:\\n \\n\\n\\n The Sacred Circle ""TEMENOS"" \\n\\n\'', "" ''""]","['protesters', 'hanukkah', 'december', 'los angeles', 'fm corvallis', 'ceremony', 'jewish voice', 'jewish portlanders', 'ceasefire', 'minneapolis']" +"[""['A Riverland thunderstorm saw emergency services have a busy afternoon yesterday.'"", "" 'The Barmera & Berri regions were seen to cop the heaviest of the storm conditions""]","['wine', 'trees', 'services', 'industry body', 'rainfall', 'emergency', 'grape', 'emergency services', 'afternoon', 'thunderstorm']" +"[""['MONTGOMERY"", "" Ala. (WSFA) - A man who was seriously injured in an early Tuesday morning house fire in Montgomery has died.'""]","['death', 'smoke', 'result', 'victim', 'montgomery', 'morning', 'truck', 'search', 'rescue', 'faster']" +"[""['LOGAN — A family was displaced after their home was badly damaged in a fire Tuesday morning. The blaze was reported just after 9:30 a.m."", ' Jan. 16']","['damage', 'smoke damage', 'morning', 'heating', 'lamp', 'emergency', 'traffic', 'required fields', 'firefighters', 'nominee']" +"[""['An earthquake hit south Santa Cruz County Tuesday morning. '"", "" 'According to the United States Geological Survey""]","['retailer sites', 'aromas', 'miles', 'magnitude', 'hearst television', 'damage', 'morning', 'earthquake', 'quake', 'watsonville']" +"[""['Jon Gambrell'"", "" 'Associated Press'""]","['baluchistan', 'hosting insurgents', 'islamic state', 'provincial resources', 'baluch nationalists', 'baluchistan province', 'behest', 'insurgency', 'bases', 'switzerland']" +"[""['BETTENDORF"", ' Iowa (KWQC) - The Bettendorf Fire Department responded Tuesday afternoon to a reported fire at Sivyer Steel Castings']","['officials', 'kwqc', 'rights', 'scene', 'iowa', 'sivyer steel', 'plant', 'september', 'trucks', 'afternoon']" +"[""['Sign In'"", "" 'Register'""]","['evacuation', 'israeli forces', 'rebels', 'bases', 'israeli airstrikes', 'cease', 'baghdad', 'egypt', 'widespread famine', 'disease']" +"[""['Cloudy'"", "" '44'""]","['saturday', 'court documents', 'february', 'robbers', 'red sneakers', 'robbery', 'kevin schuster', 'surveillance', 'rensselaer', 'rensselaer man']" +"[""['PORTLAND"", "" Ore. (KOIN) — Freezing rain is already starting to push into parts of Oregon and ice will accumulate through Tuesday night and into Wednesday morning.'""]","['nfl evaluators', 'freezing', 'crystal', 'windy conditions', 'freezing surfaces', 'betting odds', 'temperatures', 'winter weather', 'orlando', 'freezing rain']" +"[""['Winter storms have delivered significant snow to the Northern and Central Sierra Nevada"", "" though the snowpack across the state remains below average for this time of year.'""]","['prevention', 'weeks', 'snowpack conditions', 'record rainfall', 'princeton university', 'damaging storm', 'winter storms', 'additional snow', 'significant snow', 'wet snow']" +"[""['Plus'"", "" 'You are not permitted to download""]","['blaze', 'damage', 'kaitangata', 'coal dust', 'embers', 'roofing iron', 'morning', 'parents', 'january', 'firefighters']" +"[""['Partly Cloudy'"", "" '38'""]","['smoke', 'wkrc', 'result', 'tuesday', 'interactive radar', 'fires', 'critical condition', 'pleasant ridge', 'floor', 'medicine']" +"[""['SHERMAN"", ' Texas (KXII) - For the past 18 months']","['case', 'suspicious pasture', 'fires', 'john weda', 'weda', 'ranches', '100 yards', 'grass', 'hay bales', 'circumstances']" +"['[""\\nIt\'s believed the incident was quickly brought under control', ' with the building reported to be undamaged.\\n""']","['news', 'gardaí', 'ukraine', 'war', 'services', 'families', 'lanesboro convent', 'emergency', 'emergency services', 'refugees']" +"[""['\\n'"", "" '\\n\\t\\t\\tJanuary 16""]","['kilometres', 'centimetres', '10 centimetres', 'greater victoria', 'rain forecasts', 'wind chill', '25 centimetres', 'snowstorm', 'winter storm', 'saskatchewan']" +"['[""If you have any queries about this error', ' try emailing feedback@mirror.co.uk and we\'ll do what we can to help you.""']","['approach', 'flood', 'storm henk', 'preventative measures', 'flood protections', 'floodplain', 'flooding', 'existing defences', 'flood resilience', 'deteriorating defences']" +"[""['Share'"", ' ""FOX 5 NY\'s Mike Woods has the details.""']","['higher projections', 'dusting', 'laguardia airport', 'friday night', 'steven yablonski', 'frigid cold', 'snowstorm', 'additional snowfall', 'accumulating snow', 'measurable snow']" +"['[""Heavy snowfall on Jan. 15 caused the metal roof over a dock at Sun Life\'s Concord Marina to collapse', ' resulting in extensive damage.""']","['trucks', 'jason fitz', 'javon bullard', 'afternoon', 'nfl evaluators', 'crystal', 'precaution', 'betting odds', 'orlando', 'heavy snowfall']" +"[""['Next up in 5'"", "" 'Example video title will go here for this video'""]","['browser', 'subscribe', 'tuesday', 'weather', 'attic', 'blaze', 'damage', 'morning', 'firefighters', 'muscatine firefighters']" +"[""['-'"", "" 'Fire crews have dealt with a blaze that has caused extensive damage to a house in North Clare this afternoon.'""]","['blaze', 'damage', 'medical attention', 'breathing apparatus', 'breathing', 'emergency', 'rescue service', 'emergency services', 'firefighters', 'afternoon']" +"[""['Cloudy'"", "" '47'""]","['monday', 'vehicle', 'sunday', 'interactive radar', 'holiday circle', 'roanoke man', 'saturday', 'rescue', 'firefighters', 'kaylee shipley']" +"[""['Text NEWS1 to 256-646-5300 to receive News Alerts from WEIS Radio! Like us on Facebook and follow us on Twitter!'"", "" '(NEW YORK) —\\xa0More than 100 million Americans are on alert Tuesday for dangerous wind chills""]","['avalanche warnings', 'icy conditions', 'afternoon', 'freeze', 'friday morning', '701 days', 'degrees', 'temperatures', '728 days', 'freezing rain']" +"[""['\\n\\n World Brief:\\n \\n\\n Iran Targets Alleged Israeli Spy Headquarters in Iraqi Kurdistan \\n'"", "" 'Create an FP account to save articles to read later and in the FP mobile app.'""]","['beliefs', 'sharp condemnation', 'bases', 'transgender people', 'baghdad', 'transgender', 'switzerland', 'iraqi kurdistan', 'taiwanese voters', 'gibraltar eagle']" +"[""['KANSAS CITY"", "" Mo. (KCTV/Gray News) - A 4-year-old child is dead and five others were injured in a house fire in Missouri Tuesday morning.'""]","['child', 'flames', 'trailer', 'hospital', 'oakland avenue', 'adults', 'morning', 'children', 'emergency', 'firefighters']" +"[""[' Sign In'"", "" ' Subscribe Now'""]","['hotter summers', 'coastal flooding', 'aaron sutherland', 'erosion', 'bigger dykes', 'floodplain exposure', 'catastrophes', 'wildfires', 'natural catastrophes', 'insured damage']" +"[""['\\nBrookings Radio is a part of Alpha Media LLC.© 2024 Alpha Media LLC. All Rights Reserved. ']""]",['radio'] +"[""['Next up in 5'"", "" 'Example video title will go here for this video'""]","['caution', 'heating', 'emergency', 'temperature', 'wwl louisiana', 'flammable material', 'firefighters', 'frozen pipes', 'temperatures', 'freezing temperatures']" +"[""['Authored by Naveen Athrappully via The Epoch Times (emphasis ours)"", ""'""]","['traffic', 'supervisors', 'offensive comments', 'israelis', 'controversy', 'israeli citizens', 'president reagan', 'joyce karam', 'hamas terrorists', 'ceasefire']" +"[""['Snow"", ' sleet and rain gave way to colder temperatures across Massachusetts and beyond Tuesday night']","['uxbridge police', 'emergency crews', 'sunbeam television', 'degrees', 'sudbury river', 'temperatures', 'crashes', 'winter temperatures', 'falling temperatures', 'stormy weather']" +"[""['\\n\\n\\n\\nSign in or Subscribe \\nSee Offers\\n\\n\\n\\n'"", "" ''""]","['cape elizabeth', 'eric laszlo', 'rescue crews', 'mph', 'windstorm', 'minutes', '50 mph', 'low tide', 'choppy waves', 'beach']" +"[""['...'"", ' ""With Beyoncé set to drop her new Cowboy Carter album in a couple of hours']","['conflicts', 'thursday', 'streak', 'armed conflicts', 'couple', 'beyoncé', 'friday', 'rj davis', 'alabama basketball', 'disasters']" +"[""['Next up in 5'"", "" 'Example video title will go here for this video'""]","['noon', 'morning', 'trimet buses', 'friday', 'emergency', 'estimate', 'mph', 'afternoon', '25 mph', 'ice storm']" +"[""['published :\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t17 Jan 2024 at 08:34\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t'"", "" 'writer: \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tGary Boyle'""]","['cctv footage', 'teenagers', 'camera', 'security cameras', 'suspects', 'motorcycles', 'doubts', 'bloodstains', 'clear doubts', 'panya khongsaengkham']" +"[""['Next up in 5'"", "" 'Example video title will go here for this video'""]","['cases', 'pipes', 'families', 'everybody', 'drinking water', 'crash', 'recovery', 'floor', 'frozen equipment', 'machines']" +"[""['\\n107.3 Mod FM is a part of Alpha Media LLC.© 2024 Alpha Media LLC. All Rights Reserved. ']""]",['mod fm'] +"[""['RAFAH"", "" Gaza Strip (AP) — Palestinian militants battled Israeli forces in devastated northern Gaza and launched a barrage of rockets from farther south Tuesday in a show of force more than 100 days into Israel’s massive air and ground campaign against the tiny coastal enclave.'""]","['evacuation', 'israeli forces', 'rebels', 'bases', 'cease', 'israeli airstrikes', 'baghdad', 'egypt', 'widespread famine', 'disease']" +"[""['We’re sorry"", "" this feature is currently unavailable. We’re working to restore it. Please try again later.'""]","['garden beds', 'beach accesses', 'surveillance', 'beaches', 'beach', 'south australia', 'queensland floodwaters', 'floodwaters', 'western australia', 'mainland australia']" +"[""['Cloudy'"", "" '37'""]","['scene', 'krcg', 'tuesday', 'wednesday', 'interactive radar', 'bartley lane', 'thursday', 'morning', 'remains', 'regan mertz']" diff --git a/DOTS/output/small0_dots_feats.csv b/DOTS/output/small0_dots_feats.csv index 9b4d671..6850cb3 100644 --- a/DOTS/output/small0_dots_feats.csv +++ b/DOTS/output/small0_dots_feats.csv @@ -95,3 +95,83 @@ "[""['Gaza City"", ' Israel (General)']","['widespread famine', 'cease', 'disease']" "[""['Moreton Bay"", ' Queensland']","['western australia', 'floodwaters', 'mainland australia']" "['[""[\'callaway county sheriff office\']""', "" '17-01-2024'""]","['morning', 'regan mertz', 'remains']" +famine,trailblazers,camps,bases,egypt,evacuation,cease,rebels,widespread famine,disease +waste,cataclysms,disasters,eruption,valorizzazione,mount vesuvius,ancient cataclysm,mariano nuzzo,costruzione,beach +district,discussion,firearm,clarendon,old harbour,sunday,new bowen,whatsapp,st catherine,fight +700 days,cancellations,travel delays,school cancellations,streak,snow shovels,meteorologists,snowstorm,accuweather alerts,accuweather meteorologists +vehicle,juvenile,detectives,wbsm,kate robinson,saturday,traffic,ariel dorsey,massachusetts law,conflicting statements +extensive damage,medical attention,breathing apparatus,emergency,damage,afternoon,breathing,rescue service,emergency services,firefighters +checklists,northeast,miles,magnitude,morning,preparedness,watsonville,earthquake,disaster,quake +saleem algerk,bread,early recovery,camps,pumping,cholera cases,neighboring countries,symptom,cholera,mohamad katoub +volcano,grindavík,volcanoes,lava flows,eruption,earthquakes,evacuation,jóhannesson,eruptions,lúðvík pétursson +glaesemann,30th street,39 horses,horses,farnam street,jose galeno,federal courthouse,winter storms,confiscated horses,heavy snow +reward,detective,victim,washington,decision,perpetrator,conviction,michael dorgan,minutes,southern border +shiite muslims,helicopter,revenge,israeli jews,women workers,bases,struggle,civilian ships,cease,ceasefire +stabbings,collapse,hard work,garcetti,volatile times,buy voltas,diktat,sonam wangchuk,poonawalla,underperformer +emotions,rain returns,couple,adrenaline,kitchen,swaying,neighbors,cason wolcott,mixed emotions,philip wolcott +resource,representation,server,mod_security,error +shifa hospital,winds nw,khan yunis,medwish,weeks,mph,israeli soldiers,ahmed kahlot,nominations,israeli interrogators +injuries,comments,eric sweda,sweda,standard procedure,confrontation,dispatcher,dispatchers,meagan drillinger,rachel cavanaugh +inbox,llc +material,occupants,block,day,cause,home,news,vehicle,casasia drive,orlando +surveillance,temperatures,wildfires,laffy taffy,surrounding counties,cooler temperatures,unwanted wildfires,gusty winds,disease,golden eagles +scene,home,evening,water,injuries,fredericktown,monday evening,highway oo,firefighters,temperatures +investigation,lyndaker,structure,rights,cause,department,flames,home,deputies,firefighters +programs,content,commissions,affiliate,retailer,products,links,retailer sites,marketing,hearst television +fishing,eruptive activity,atmosphere,swarm,eruption,earthquakes,machines,enormous clouds,keflavík airport,sundhnúksgígar +temperatures,frost,drizzle,west winds,cold weather,highest temperatures,lowest temperatures,widespread frost,freezing fog,west breezes +sister station,monday,kcra,california,sister,metal,overhang,damage,hearst television,pumps +snowfall,preparedness,vancouver island,afternoon,greater victoria,winter conditions,extreme weather,freezing rain,heavy snow,widespread snow +gospel +famine,heavy fighting,matthew wright,bases,rebels,israeli airstrikes,evacuation,baghdad,widespread famine,disease +opportunity,wednesday,newsmax,weather,votes,march,night,tuesday night,vote,shutdown +hometowns,new york,accomplice,damage,morning,stanivukovic,mclain street,federal charges,explosion,dispute +block,command,solution,page,phrase,word,ray id,attacks,malformed data,online attacks +search,whitelist,rescue service,secondary schools,rescue services,firefighters,dublin castle,afternoon,near miss,evacuation +site,button,page,browser +texas,calls,witnesses,debris,restaurant,conversation,fatalities,floor,atmos energy,explosion +flooding,flood advisory,mph,40 mph,35 mph,daytime highs,gusty winds,heavy snow,widespread snow,blowing snow +alaska airlines,cancellations,differences,flight disruption,tax brackets,wind chills,reagan airport,degrees,winter weather,severe weather +fri,redding,thu,vehicle,interactive radar,ashley gardner,completion,thursday,friday,firefighters +parks,opportunities,envisionwise technology,destination,trees,sledding,vincent moore,windstorm,gates,willamette week +boats,blaze,coast guard,large flames,leathead road,visible flame,emergency,firefighters,emergency crews,okanagan lake +half,headaches,headache,chaos,dulles international,bare pavement,victims,husbands,search,reagan national +protesters,hanukkah,december,los angeles,fm corvallis,ceremony,jewish voice,jewish portlanders,ceasefire,minneapolis +wine,trees,services,industry body,rainfall,emergency,grape,emergency services,afternoon,thunderstorm +death,smoke,result,victim,montgomery,morning,truck,search,rescue,faster +damage,smoke damage,morning,heating,lamp,emergency,traffic,required fields,firefighters,nominee +retailer sites,aromas,miles,magnitude,hearst television,damage,morning,earthquake,quake,watsonville +baluchistan,hosting insurgents,islamic state,provincial resources,baluch nationalists,baluchistan province,behest,insurgency,bases,switzerland +officials,kwqc,rights,scene,iowa,sivyer steel,plant,september,trucks,afternoon +evacuation,israeli forces,rebels,bases,israeli airstrikes,cease,baghdad,egypt,widespread famine,disease +saturday,court documents,february,robbers,red sneakers,robbery,kevin schuster,surveillance,rensselaer,rensselaer man +nfl evaluators,freezing,crystal,windy conditions,freezing surfaces,betting odds,temperatures,winter weather,orlando,freezing rain +prevention,weeks,snowpack conditions,record rainfall,princeton university,damaging storm,winter storms,additional snow,significant snow,wet snow +blaze,damage,kaitangata,coal dust,embers,roofing iron,morning,parents,january,firefighters +smoke,wkrc,result,tuesday,interactive radar,fires,critical condition,pleasant ridge,floor,medicine +case,suspicious pasture,fires,john weda,weda,ranches,100 yards,grass,hay bales,circumstances +news,gardaí,ukraine,war,services,families,lanesboro convent,emergency,emergency services,refugees +kilometres,centimetres,10 centimetres,greater victoria,rain forecasts,wind chill,25 centimetres,snowstorm,winter storm,saskatchewan +approach,flood,storm henk,preventative measures,flood protections,floodplain,flooding,existing defences,flood resilience,deteriorating defences +higher projections,dusting,laguardia airport,friday night,steven yablonski,frigid cold,snowstorm,additional snowfall,accumulating snow,measurable snow +trucks,jason fitz,javon bullard,afternoon,nfl evaluators,crystal,precaution,betting odds,orlando,heavy snowfall +browser,subscribe,tuesday,weather,attic,blaze,damage,morning,firefighters,muscatine firefighters +blaze,damage,medical attention,breathing apparatus,breathing,emergency,rescue service,emergency services,firefighters,afternoon +monday,vehicle,sunday,interactive radar,holiday circle,roanoke man,saturday,rescue,firefighters,kaylee shipley +avalanche warnings,icy conditions,afternoon,freeze,friday morning,701 days,degrees,temperatures,728 days,freezing rain +beliefs,sharp condemnation,bases,transgender people,baghdad,transgender,switzerland,iraqi kurdistan,taiwanese voters,gibraltar eagle +child,flames,trailer,hospital,oakland avenue,adults,morning,children,emergency,firefighters +hotter summers,coastal flooding,aaron sutherland,erosion,bigger dykes,floodplain exposure,catastrophes,wildfires,natural catastrophes,insured damage +radio +caution,heating,emergency,temperature,wwl louisiana,flammable material,firefighters,frozen pipes,temperatures,freezing temperatures +traffic,supervisors,offensive comments,israelis,controversy,israeli citizens,president reagan,joyce karam,hamas terrorists,ceasefire +uxbridge police,emergency crews,sunbeam television,degrees,sudbury river,temperatures,crashes,winter temperatures,falling temperatures,stormy weather +cape elizabeth,eric laszlo,rescue crews,mph,windstorm,minutes,50 mph,low tide,choppy waves,beach +conflicts,thursday,streak,armed conflicts,couple,beyoncé,friday,rj davis,alabama basketball,disasters +noon,morning,trimet buses,friday,emergency,estimate,mph,afternoon,25 mph,ice storm +cctv footage,teenagers,camera,security cameras,suspects,motorcycles,doubts,bloodstains,clear doubts,panya khongsaengkham +cases,pipes,families,everybody,drinking water,crash,recovery,floor,frozen equipment,machines +mod fm +evacuation,israeli forces,rebels,bases,cease,israeli airstrikes,baghdad,egypt,widespread famine,disease +garden beds,beach accesses,surveillance,beaches,beach,south australia,queensland floodwaters,floodwaters,western australia,mainland australia +scene,krcg,tuesday,wednesday,interactive radar,bartley lane,thursday,morning,remains,regan mertz diff --git a/DOTS/output/test_gnews_dots_feats.csv b/DOTS/output/test_gnews_dots_feats.csv new file mode 100644 index 0000000..f2f5dfb --- /dev/null +++ b/DOTS/output/test_gnews_dots_feats.csv @@ -0,0 +1,2 @@ +escalation,agency inputs,concerns,fatalities,chief minister,87 fatalities,chinese citizens,previous attacks,february,ceasefire +families,famine,extreme suffering,carbohydrates,emergency levels,acute malnutrition,shocks,catastrophic hunger,catastrophe,disease diff --git a/DOTS/pull.py b/DOTS/pull.py index 667b062..951b635 100644 --- a/DOTS/pull.py +++ b/DOTS/pull.py @@ -40,15 +40,6 @@ def process_hit(hit): text.append(p.get_text()) return date,loc,title,org,per,theme,text,url - -def process_hit_with_timeout(hit): - try: - return process_hit(hit) - except: - logging.debug(f"Grabbing the url stalled after 5s, skipping...") - return None - - def process_data(data,fast=1): articles = [] results=[] @@ -159,7 +150,7 @@ def pull_data(articles): except: df = pd.DataFrame(data, columns=['title','id','url','title2']) with concurrent.futures.ThreadPoolExecutor() as executor: - df['text'] = list(tqdm(executor.map(process_url, df['url']), total=len(df['url']))) + df['text'] = list(tqdm(executor.map(process_url, df['url']), total=len(df['url']),desc="grabbing text from url")) return df['text'].values.tolist() diff --git a/DOTS/scrape.py b/DOTS/scrape.py index 31ce1e1..579a77f 100644 --- a/DOTS/scrape.py +++ b/DOTS/scrape.py @@ -10,11 +10,15 @@ os_url = os.getenv('OS_TOKEN') lobstr_key = os.getenv('LOBSTR_KEY') -def get_OS_data(n): +def get_OS_data(n=20): bash_command = f""" - curl -X GET "{os_url}/emergency-management-news/_search" -H 'Content-Type: application/json' -d '{{ + curl -X GET "{os_url}/emergency-management-news/_search?scroll=1m" -H 'Content-Type: application/json' -d '{{ "_source": ["metadata.GDELT_DATE", "metadata.page_title","metadata.DocumentIdentifier", "metadata.Organizations","metadata.Persons","metadata.Themes","metadata.text", "metadata.Locations"], "size": {n}, + "slice": {{ + "id": 0, + "max": 10 + }}, "query": {{ "bool": {{ "must": [ @@ -29,30 +33,16 @@ def get_OS_data(n): data = json.loads(output) return data -def get_gnews_data(n): - bash_command = f""" - curl -X GET "{os_url}/test-google-news-index/_search" -H 'Content-Type: application/json' '{{ - "_source": ["metadata.link", "metadata.title"], - "size": {n}, - "query": {{ - "bool": {{ - "must": [ - {{"match_all": {{}}}} - ] - }} - }} - }}' - """ - process = subprocess.run(bash_command, shell=True, capture_output=True, text=True) - output = process.stdout - data = json.loads(output) - return data -def get_test_gnews(n): +def get_test_gnews(n=20): bash_command = f""" - curl -X GET "{os_url}/test-google-news-index/_search" '{{ + curl -X GET "{os_url}/test-google-news-index/_search?scroll=1m" '{{ "_source": ["metadata.link", "metadata.title"], "size": {n}, + "slice": {{ + "id": 0, + "max": 100 + }}, "query": {{ "bool": {{ "must": [ @@ -67,84 +57,28 @@ def get_test_gnews(n): data = json.loads(output) return data +# def get_npr_news(p): +# # Send a GET request to the NPR API +# r = requests.get("http://api.=1m.org/query?apiKey="+npr_key[0], params=p) -def get_massive_OS_data(t=1): - client = OpenSearch(os_url) - query = { - "size": "100", - "timeout": "10s", - "slice": { - "id": 0, - "max": 10 - }, - "query": { - "bool": { - "must": [ - {"match_all": {}}, - ]} - }, - "_source": ["metadata.GDELT_DATE", "metadata.page_title","metadata.DocumentIdentifier", "metadata.Organizations","metadata.Persons","metadata.Themes","metadata.text", "metadata.Locations"], - } - response = client.search( - scroll=str(t)+'m', - body=query, - ) - - return response, client - -def get_google_news(theme,n=10000): - google_news = GNews() - - google_news.period = '7d' # News from last 7 days - google_news.max_results = n # number of responses across a keyword - # google_news.country = 'United States' # News from a specific country - google_news.language = 'english' # News in a specific language - google_news.exclude_websites = ['yahoo.com', 'cnn.com'] # Exclude news from specific website i.e Yahoo.com and CNN.com - # google_news.start_date = (2024, 1, 1) # Search from 1st Jan 2020 - # google_news.end_date = (2024, 3, 1) # Search until 1st March 2020 - - json_resp = google_news.get_news(theme) - article=[] - - for i in tqdm(range(len(json_resp)), desc="grabbing directly from GoogleNews"): - aa=(google_news.get_full_article(json_resp[i]['url'])) - try: - date=aa.publish_date.strftime("%d-%m-%Y") - except: - date=None - try: - title=aa.title - text=aa.text - except: - title=None - text=None - article.append([title,date,text]) - - return article - - -def get_npr_news(p): - # Send a GET request to the NPR API - r = requests.get("http://api.=1m.org/query?apiKey="+npr_key[0], params=p) - - # Parse the XML response to get the story URLs - root = ET.fromstring(r.content) - story_urls = [story.find('link').text for story in root.iter('story')] +# # Parse the XML response to get the story URLs +# root = ET.fromstring(r.content) +# story_urls = [story.find('link').text for story in root.iter('story')] - # For each story URL, send a GET request to get the HTML content - full_stories = [] - for url in story_urls: - response = requests.get(url) - soup = BeautifulSoup(response.text, 'html.parser') +# # For each story URL, send a GET request to get the HTML content +# full_stories = [] +# for url in story_urls: +# response = requests.get(url) +# soup = BeautifulSoup(response.text, 'html.parser') - # Find the main content of the story. This will depend on the structure of the webpage. - # Here, we're assuming that the main content is in a

tag. You might need to adjust this depending on the webpage structure. - story = soup.find_all('p') +# # Find the main content of the story. This will depend on the structure of the webpage. +# # Here, we're assuming that the main content is in a

tag. You might need to adjust this depending on the webpage structure. +# story = soup.find_all('p') - # Extract the text from the story - full_story = ' '.join(p.text for p in story) - full_stories.append(full_story) - return full_stories +# # Extract the text from the story +# full_story = ' '.join(p.text for p in story) +# full_stories.append(full_story) +# return full_stories def scrape_lobstr(): subprocess.run([ diff --git a/demo/dots_feat.ipynb b/demo/dots_feat.ipynb index 2094870..3330ca4 100644 --- a/demo/dots_feat.ipynb +++ b/demo/dots_feat.ipynb @@ -3098,14 +3098,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 20/20 [00:03<00:00, 5.34it/s]\n" + "100%|██████████| 20/20 [00:03<00:00, 5.54it/s]\n" ] } ], "source": [ - "# response = get_OS_data(20)\n", - "# hits = response[\"hits\"][\"hits\"]\n", - "# article = pull_data(hits)\n" + "response = get_OS_data()\n", + "hits = response[\"hits\"][\"hits\"]\n", + "article = pull_data(hits)\n" ] }, { @@ -3114,116 +3114,79 @@ "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10/10 [00:05<00:00, 1.93it/s]\n" - ] + "data": { + "text/plain": [ + "'[\\'On the night of Thursday 11-Friday 12 January the US and UK militaries bombed sites in Yemen controlled by the Houthi movement. The stated aim of the action, according to a UK spokesperson, was to, “protect vessels and freedom of navigation [in the Red Sea],” by destroying Houthi military bases, radar installations, drone and missile launch sites.\\', \\'The US claimed to have hit 28 targets with 150 missiles. The Houthis say there were 72 strikes and five of their fighters were killed. They promised to take revenge on the US.\\', \\'The relatively small death toll, if accurate, probably indicates that the sites the US hit had been largely evacuated. The US issued a warning to the Houthis, in advance, and probably has not so far done much damage to Houthi military infrastructure.\\', \\'Since October the Houthis have launched missile and drone strikes on the Israeli port of Eliat and international shipping in the Red Sea in support of Hamas in Gaza.\\', \\'On 19 November Houthi troops landed by helicopter on Galaxy Leader, a Japanese-operated cargo ship linked to an Israeli billionaire. The hijacked ship was taken to Salif, a port in Western Yemen. \\', \\'The Houthis have conducted 27 attacks on civilian ships since mid-October and on the evening of Tuesday 9 January they launched 18 drones and three missiles at UK and US warships which had been sent to the Red Sea to defend the shipping lanes.\\', \\'Cargo has been re-routed, and complex supply chains have been disrupted, including, for example, Tesla and Volvo car production in Europe which depends on components shipped from Asia.\\', \\'The Houthis are Shiite Muslims based in the north of Yemen who currently control most of the inhabited areas of the country, including the capital, Sana’a, which they overran in 2014. The ultra-reactionary movement’s main slogan is, “God is the greatest, Death to America, Death to Israel, Curse the Jews.”\\', \\'The Houthis are aligned with Iran, which supplies them with weaponry and training, although they are from different branches of Shiia Islam and are not natural close allies. The Houthis retain their own independent command structures, have a distinct set of priorities, and are not Iranian state puppets.\\', \"The Houthis\\' campaign to support Hamas by targeting ships in the Red Sea is to be condemned. Disrupting the world economy in this way, for these reasons, is not a progressive act “against imperialism” or a “defence of the Palestinians.” These are actions of a religious-military dictatorship motivated by ideological sympathy for the Hamas pogrom against Israeli Jews.\", \\'But it is not clear that the US-UK military action will help – that it will stop the Houthis, who seem to be looking for a confrontation. The Houthi movement has just emerged from a long-running war with a Saudi-UAE-led coalition. They have built a sophisticated, large and adaptable military machine.\\', \\'The Saudis aimed to smash the Iran-backed movement and launched 25,000 airstrikes on Yemen over seven years after 2015. The Houthis saw off the Saudis and UAE troops, and emerged stronger and much better armed. The Saudi action pushed the Houthis closer to Iran.\\', \\'The US-UK action has strengthened the Houthis’ position inside Yemen. There was an enormous demonstration against the Western bombing in Sana’a on Friday 12 January.\\', \\'The Western powers have also made a regional war more likely.\\', \\'What the US and its allies should do, positively, is openly demand that the Israeli government stops the war in Gaza and offers the Palestinians a free, independent state alongside Israel. That is the right thing for the Palestinians, who are harmed by association with the Houthi movement. A Two States settlement would undercut the Houthis politically.\\', \\'Book review of Über Israel Reden: Eine Deutsche Debatte (Talking about Israel: A German...\\', \\'We want a permanent ceasefire and a peace deal, but an immediate cease-fire of any length...\\', \\'Israeli society is a site of struggle. There are few more direct symbols of those...\\', \\'Women workers in Mahalla textile struck to be included in the minimum wage and for a meal...\\', \\'The Morning Star published Charlotte\\\\\\'s Church’s justification of the slogan \"From the...\\', \"The Alliance for Workers\\' Liberty is an organisation fighting as part of the labour movement for a socialist alternative to both capitalism and Stalinism, based on common ownership and democracy.\", \\'020 7394 8923 \\\\xa0\\\\xa0\\\\xa0\\\\xa0awl@workersliberty.org\\', \\'Copyright 2023 Workers’ Liberty\\', \\'Subscribe to email list\\', \\'This website uses cookies, you can find out more and set your preferences here. By continuing to use this website, you agree to our Privacy Policy and Terms & Conditions.\\']'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "response = get_gnews_data(20)\n", - "hits = response[\"hits\"][\"hits\"]\n", - "article = pull_data(hits)\n" + "str(article[12])" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 50/50 [00:08<00:00, 5.59it/s]\n" + "/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ - "# response = get_gnews_data(20)\n", - "hits = data[\"hits\"][\"hits\"]\n", - "article = pull_data(hits)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from dotenv import load_dotenv\n", - "import subprocess, json, argparse, os,requests\n", - "\n", - "load_dotenv()\n", - "os_url = os.getenv('OS_TOKEN')\n", - "bash_command = f\"\"\"\n", - "curl -X GET \"{os_url}/test-google-news-index/_search?scroll=1m\" '{{\n", - "\"_source\": [\"metadata.link\", \"metadata.title\"],\n", - " \"size\": {20},\n", - " \"query\": {{\n", - " \"bool\": {{\n", - " \"must\": [\n", - " {{\"match_all\": {{}}}}\n", - " ]\n", - " }}\n", - " }}\n", - "}}'\n", - "\"\"\"\n", - "process = subprocess.run(bash_command, shell=True, capture_output=True, text=True)\n", - "output = process.stdout\n", - "data = json.loads(output)" + "from DOTS.feat import featurize_stories\n", + "import graphistry\n", + "import umap" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "50" + "['stalinism',\n", + " 'revenge',\n", + " 'women workers',\n", + " 'ceasefire',\n", + " 'civilian ships',\n", + " 'shiia islam',\n", + " 'israeli jews',\n", + " 'bases',\n", + " 'cease',\n", + " 'struggle']" ] }, - "execution_count": 95, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "hits = data[\"hits\"][\"hits\"]\n", - "len(hits)" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [], - "source": [ - "from DOTS.feat import featurize_stories\n", - "import graphistry\n", - "import umap" + "featurize_stories(str(article[12]), top_k = 10, max_len=512)" ] }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/50 [00:00, columns=RangeIndex(start=0, stop=0, step=1)))\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ scaling_pipeline_target ]]: FunctionTransformer(func=functools.partial(, columns=Index([], dtype='int64')))\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ text_model ]]: None\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ text_cols ]]: None\n", + "03/29/2024 04:21:28 PM - Removing `_n` from input X_symbolic DataFrame03/29/2024 04:21:28 PM - process_nodes_dataframes[dirty_cat]03/29/2024 04:21:28 PM - ----------------------------------------03/29/2024 04:21:28 PM - \n", + "-- Setting Encoder Parts from Fit ::03/29/2024 04:21:28 PM - Feature Columns In: Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='object')03/29/2024 04:21:28 PM - Target Columns In: Index([], dtype='object')03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ data_encoder ]]: Empty DataFrame\n", + "Columns: []\n", + "Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ label_encoder ]]: Empty DataFrame\n", + "Columns: []\n", + "Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ scaling_pipeline ]]: FunctionTransformer(func=functools.partial(, columns=Index([], dtype='object')))\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ scaling_pipeline_target ]]: FunctionTransformer(func=functools.partial(, columns=Index([], dtype='object')))\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ text_model ]]: None\n", + "03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - [[ text_cols ]]: None\n", + "03/29/2024 04:21:28 PM - -Reusing Existing Node Featurization03/29/2024 04:21:28 PM - * Ignoring target column of shape (17, 0) in UMAP fit, as it is not one dimensional03/29/2024 04:21:28 PM - ------------------------------------------------------------------------------------------03/29/2024 04:21:28 PM - Starting UMAP-ing data of shape (17, 0)" + ] + }, + { + "ename": "ValueError", + "evalue": "at least one array or dtype is required", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m g2 \u001b[38;5;241m=\u001b[39m g\u001b[38;5;241m.\u001b[39mfeaturize()\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# g2._clustersummary()\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m g3 \u001b[38;5;241m=\u001b[39m \u001b[43mg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mumap\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m g3\u001b[38;5;241m.\u001b[39mdbscan()\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/graphistry/umap_utils.py:623\u001b[0m, in \u001b[0;36mUMAPMixin.umap\u001b[0;34m(self, X, y, kind, scale, n_neighbors, min_dist, spread, local_connectivity, repulsion_strength, negative_sample_rate, n_components, metric, suffix, play, encode_position, encode_weight, dbscan, engine, feature_engine, inplace, memoize, verbose, **featurize_kwargs)\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[38;5;66;03m# add the safe coercion here \u001b[39;00m\n\u001b[1;32m 621\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m make_safe_gpu_dataframes(X_, y_, res\u001b[38;5;241m.\u001b[39mengine) \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[0;32m--> 623\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mres\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_umap\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 624\u001b[0m \u001b[43m \u001b[49m\u001b[43mres\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkind\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmemoize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfeaturize_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mumap_kwargs\u001b[49m\n\u001b[1;32m 625\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 627\u001b[0m res\u001b[38;5;241m.\u001b[39m_weighted_adjacency_nodes \u001b[38;5;241m=\u001b[39m res\u001b[38;5;241m.\u001b[39m_weighted_adjacency\n\u001b[1;32m 628\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m res\u001b[38;5;241m.\u001b[39m_xy \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/graphistry/umap_utils.py:415\u001b[0m, in \u001b[0;36mUMAPMixin._process_umap\u001b[0;34m(self, res, X_, y_, kind, memoize, featurize_kwargs, verbose, **umap_kwargs)\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m** Fitting UMAP\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mif\u001b[39;00m verbose \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 413\u001b[0m res \u001b[38;5;241m=\u001b[39m res\u001b[38;5;241m.\u001b[39mumap_lazy_init(res, verbose\u001b[38;5;241m=\u001b[39mverbose, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mumap_kwargs_pure)\n\u001b[0;32m--> 415\u001b[0m emb \u001b[38;5;241m=\u001b[39m \u001b[43mres\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_umap_fit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 416\u001b[0m res\u001b[38;5;241m.\u001b[39m_xy \u001b[38;5;241m=\u001b[39m emb\n\u001b[1;32m 417\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/graphistry/umap_utils.py:305\u001b[0m, in \u001b[0;36mUMAPMixin._umap_fit_transform\u001b[0;34m(self, X, y, verbose)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_umap \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUMAP is not initialized\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mumap_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 306\u001b[0m emb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_umap\u001b[38;5;241m.\u001b[39mtransform(X)\n\u001b[1;32m 307\u001b[0m emb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_bundle_embedding(emb, index\u001b[38;5;241m=\u001b[39mX\u001b[38;5;241m.\u001b[39mindex)\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/graphistry/umap_utils.py:288\u001b[0m, in \u001b[0;36mUMAPMixin.umap_fit\u001b[0;34m(self, X, y, verbose)\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_umap\u001b[38;5;241m.\u001b[39mgraph_ \u001b[38;5;241m=\u001b[39m knn\u001b[38;5;241m.\u001b[39mkneighbors_graph(cc\u001b[38;5;241m.\u001b[39membedding_)\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 288\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_umap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_weighted_adjacency \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_umap\u001b[38;5;241m.\u001b[39mgraph_\n\u001b[1;32m 291\u001b[0m \u001b[38;5;66;03m# if changing, also update fresh_res\u001b[39;00m\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/umap/umap_.py:2354\u001b[0m, in \u001b[0;36mUMAP.fit\u001b[0;34m(self, X, y, force_all_finite)\u001b[0m\n\u001b[1;32m 2328\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfit\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, y\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, force_all_finite\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[1;32m 2329\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Fit X into an embedded space.\u001b[39;00m\n\u001b[1;32m 2330\u001b[0m \n\u001b[1;32m 2331\u001b[0m \u001b[38;5;124;03m Optionally use y for supervised dimension reduction.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2351\u001b[0m \u001b[38;5;124;03m Values cannot be infinite.\u001b[39;00m\n\u001b[1;32m 2352\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2354\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat32\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mC\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_all_finite\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2355\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raw_data \u001b[38;5;241m=\u001b[39m X\n\u001b[1;32m 2357\u001b[0m \u001b[38;5;66;03m# Handle all the optional arguments, setting default\u001b[39;00m\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/sklearn/utils/validation.py:795\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 791\u001b[0m pandas_requires_conversion \u001b[38;5;241m=\u001b[39m \u001b[38;5;28many\u001b[39m(\n\u001b[1;32m 792\u001b[0m _pandas_dtype_needs_early_conversion(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m dtypes_orig\n\u001b[1;32m 793\u001b[0m )\n\u001b[1;32m 794\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(dtype_iter, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;28;01mfor\u001b[39;00m dtype_iter \u001b[38;5;129;01min\u001b[39;00m dtypes_orig):\n\u001b[0;32m--> 795\u001b[0m dtype_orig \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult_type\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdtypes_orig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 796\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m pandas_requires_conversion \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(d \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m d \u001b[38;5;129;01min\u001b[39;00m dtypes_orig):\n\u001b[1;32m 797\u001b[0m \u001b[38;5;66;03m# Force object if any of the dtypes is an object\u001b[39;00m\n\u001b[1;32m 798\u001b[0m dtype_orig \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mobject\u001b[39m\n", + "\u001b[0;31mValueError\u001b[0m: at least one array or dtype is required" + ] + } + ], "source": [ "import pandas as pd\n", "data=pd.DataFrame(flattened_list) # each ranked feature is a row\n", diff --git a/main.py b/main.py index e970e9a..9f397ed 100644 --- a/main.py +++ b/main.py @@ -15,8 +15,8 @@ from DOTS.feat import chunk_text, featurize_stories -from DOTS.scrape import get_OS_data, get_google_news, get_massive_OS_data, get_npr_news, scrape_lobstr -from DOTS.pull import process_hit, process_data, pull_data, process_response, pull_lobstr_gdoc +from DOTS.scrape import get_OS_data, scrape_lobstr,get_test_gnews +from DOTS.pull import process_hit, process_data,process_response, pull_data, pull_lobstr_gdoc logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s') @@ -29,7 +29,7 @@ def _input(): parser.add_argument('-o', type=str, default='dots_feats.csv', help='Output file name') # parser.add_argument('-p', type=int, default=1, help='Parallelize requests') # parser.add_argument('-t', type=int, default=1, help='Scroll Timeout in minutes, if using "d=1" large data set') - parser.add_argument('-d', type=int, default=4, help='0 for a small amount, 1 for large, 2 for google news, 3 for lobstr, 4 for test_gnews') + parser.add_argument('-d', type=int, default=4, help='0 for OS, 1 for test_gnews, 2 for lobstr') # parser.add_argument('-e', type=datetime, default=20231231, help='end date') args, unknown = parser.parse_known_args() return args @@ -53,36 +53,34 @@ def _input(): # Main pipeline def main(args): if args.d == 0: - # data = get_OS_data(args.n) - # articles = process_data(data) response = get_OS_data(args.n) hits = response["hits"]["hits"] articles = pull_data(hits) # articles = process_response(data) dname = 'small0_' - elif args.d == 1: - response, client = get_massive_OS_data(args.t) - pagination_id = response["_scroll_id"] - hits = response["hits"]["hits"] - articles = [] - while len(hits) != 0 and len(articles2) < args.n: - response = client.scroll( - scroll=str(args.t)+'m', - scroll_id=pagination_id - ) - hits = response["hits"]["hits"] - # article = process_data(response) - articles.append(hits) - articles2 = [item for sublist in articles for item in sublist] - articles = [item for sublist in articles for item in sublist] - dname = 'large1_' + # elif args.d == 1: + # response, client = get_massive_OS_data(args.t) + # pagination_id = response["_scroll_id"] + # hits = response["hits"]["hits"] + # articles = [] + # while len(hits) != 0 and len(articles2) < args.n: + # response = client.scroll( + # scroll=str(args.t)+'m', + # scroll_id=pagination_id + # ) + # hits = response["hits"]["hits"] + # # article = process_data(response) + # articles.append(hits) + # articles2 = [item for sublist in articles for item in sublist] + # articles = [item for sublist in articles for item in sublist] + # dname = 'large1_' + # elif args.d == 2: + # articles = get_google_news('disaster') + # dname = 'google2_' elif args.d == 2: - articles = get_google_news('disaster') - dname = 'google2_' - elif args.d == 3: articles = pull_lobstr_gdoc(args.n) dname = 'lobstr3_' - elif args.d == 4: + elif args.d == 1: response = get_test_gnews(args.n) hits = response["hits"]["hits"] articles = pull_data(hits)