Updated and better: News-Origin #65

Open · wants to merge 2 commits into master
71 changes: 51 additions & 20 deletions NewClient/src/popup/App.tsx
@@ -6,9 +6,9 @@ import { Button } from "@/components/ui/button";
import { Checkbox } from "@/components/ui/checkbox";

type NewsOriginResult = {
HIGH?: [string, string][];
MINIMAL?: [string, string][];
SOME?: [string, string][];
HIGH?: string[];
MINIMAL?: string[];
SOME?: string[];
error?: string;
};

@@ -89,22 +89,53 @@ function App() {
{newsOriginResult && !newsOriginResult.error && (
<div className="mt-2">
<h4>News Origin Results:</h4>
{["HIGH", "MINIMAL", "SOME"].map((category) => (
newsOriginResult[category as keyof NewsOriginResult] && newsOriginResult[category as keyof NewsOriginResult]!.length > 0 && (
<div key={category} className="mb-2">
<h5>{category} Probability:</h5>
<ul className="list-disc ml-5">
{newsOriginResult[category as keyof NewsOriginResult]!.map(([url, description], index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url} {description && `- ${description}`}
</a>
</li>
))}
</ul>
</div>
)
))}

{/* Cleanly display the categories */}
{newsOriginResult.HIGH && newsOriginResult.HIGH.length > 0 && (
<div className="mb-2">
<strong>HIGH:</strong>
<ul className="list-disc ml-5">
{newsOriginResult.HIGH.map((url, index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url}
</a>
</li>
))}
</ul>
</div>
)}

{newsOriginResult.MINIMAL && newsOriginResult.MINIMAL.length > 0 && (
<div className="mb-2">
<strong>MINIMAL:</strong>
<ul className="list-disc ml-5">
{newsOriginResult.MINIMAL.map((url, index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url}
</a>
</li>
))}
</ul>
</div>
)}

{newsOriginResult.SOME && newsOriginResult.SOME.length > 0 && (
<div className="mb-2">
<strong>SOME:</strong>
<ul className="list-disc ml-5">
{newsOriginResult.SOME.map((url, index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url}
</a>
</li>
))}
</ul>
</div>
)}

<Button variant="destructive" onClick={handleDeleteNewsOrigin}>
Delete News Origin Results
</Button>
@@ -150,4 +181,4 @@ function App() {
);
}

export default App;
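
Note: the popup now expects each category to be a flat list of domain strings instead of [url, description] tuples. For reference, a minimal sketch of a call against the updated service and the payload shape the component assumes — the host/port and the example domains are assumptions, not part of this PR; only the /pred route and the NewsOriginResult fields come from the diff:

import requests

# Hypothetical local call; the actual host/port depend on the compose setup.
resp = requests.get(
    "http://localhost:5009/pred",
    params={"text": "Some article text to trace back to its origin."},
)
resp.raise_for_status()
result = resp.json()

# Expected shape, matching NewsOriginResult: flat lists of domain strings,
# e.g. {"HIGH": ["reuters.com"], "SOME": ["apnews.com"], "MINIMAL": []}
for category in ("HIGH", "SOME", "MINIMAL"):
    for domain in result.get(category, []):
        print(category, domain)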
7 changes: 5 additions & 2 deletions server/Dockerfile
@@ -17,6 +17,7 @@ COPY fakeNews/predictions/requirements.txt /app/fakeNews/predictions/requirements.txt
COPY imageAPI/requirements.txt /app/imageAPI/requirements.txt
COPY ReportAPI/requirements.txt /app/ReportAPI/requirements.txt
COPY Summarizer/requirements.txt /app/Summarizer/requirements.txt
COPY News_Origin/requirements.txt /app/News_Origin/requirements.txt

RUN pip install --no-cache-dir -r /app/Hate_Speech/requirements.txt && \
pip install --no-cache-dir -r /app/Click-Bait/requirements.txt && \
@@ -26,6 +27,7 @@ RUN pip install --no-cache-dir -r /app/Hate_Speech/requirements.txt && \
pip install --no-cache-dir -r /app/fakeNews/predictions/requirements.txt && \
pip install --no-cache-dir -r /app/imageAPI/requirements.txt && \
pip install --no-cache-dir -r /app/ReportAPI/requirements.txt && \
pip install --no-cache-dir -r /app/News_Origin/requirements.txt && \
pip install --no-cache-dir -r /app/Summarizer/requirements.txt

# Copy all applications' code into the container
@@ -38,9 +40,10 @@ COPY fakeNews /app/fakeNews/predictions
COPY imageAPI /app/imageAPI
COPY ReportAPI /app/ReportAPI
COPY Summarizer /app/Summarizer
COPY News_Origin /app/News_Origin

# Expose ports for Flask services
EXPOSE 5001 5002 5003 5004 5005 5006 5007 5008
EXPOSE 5001 5002 5003 5004 5005 5006 5007 5008 5009

# Default command is to start the app (can be overridden by Docker Compose)
CMD ["python", "app.py"]
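
Note: EXPOSE 5009 only documents the port; the app.py in this PR still defaults to app.run(debug=True, port=5000), so the Docker Compose command would need to bind the News_Origin service to 5009 explicitly. A minimal sketch of such an override (the host/port values are an assumption, not taken from this diff):

# Hypothetical entrypoint override in News_Origin/app.py so the service
# actually listens on the port the Dockerfile exposes.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5009, debug=False)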
203 changes: 86 additions & 117 deletions server/News_Origin/app.py
@@ -3,165 +3,134 @@
from flask_cors import CORS
from flask_swagger_ui import get_swaggerui_blueprint
import nltk
import re
from collections import defaultdict
from pattern.en import ngrams
from pattern.web import Google
from nltk.util import ngrams as nltk_ngrams
from nltk.corpus import stopwords
from nltk import ne_chunk, pos_tag
import pandas as pd
from nltk.tokenize import word_tokenize
import requests
from dotenv import load_dotenv
import pandas as pd
from collections import defaultdict
import re
import tldextract
import logging
# Download required NLTK data
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('stopwords')
nltk.download('words')

# Load environment variables from .env file
# Load environment variables
load_dotenv()

# Environment variables
NLTK_DATA_PATH = os.getenv('NLTK_DATA_PATH')
API_KEY = os.getenv('API_KEY')

# Set NLTK data path
# nltk.data.path = [NLTK_DATA_PATH]

# Flask setup
app = Flask(__name__)
app.config['TESTING'] = True
app.debug = True

# Swagger configuration
# Swagger setup
SWAGGER_URL = '/swagger'
API_URL = '/static/swagger.json'
SWAGGERUI_BLUEPRINT = get_swaggerui_blueprint(
SWAGGER_URL,
API_URL,
config={'app_name': "Social Street Smart - News Origin"}
SWAGGER_URL, API_URL, config={'app_name': "Social Street Smart - News Origin"}
)
app.register_blueprint(SWAGGERUI_BLUEPRINT, url_prefix=SWAGGER_URL)

# CORS configuration
CORS(app)

@app.route('/')
def hello_world():
return 'Hello, World!'

@app.route('/pred', methods=['GET', 'POST'])
def predict():
text = request.args.get('text') if request.method == 'GET' else request.form.get('text')
key = request.args.get('key', API_KEY)

if not key or len(key) != 39:
key = API_KEY

sc = SourceChecker(text, 'english', key)
queries = sc.get_queries()
domains = sc.get_urls(queries)
sc.load_domains()
result = sc.render_output(domains)
# Environment variables
API_KEY = os.getenv("API_KEY") # Replace with ur Key from https://developers.google.com/custom-search/v1/introduction
CSE_ID = os.getenv("CSE_ID") # Replace with your valid CSE ID from https://programmablesearchengine.google.com/controlpanel/all

return jsonify(result), 200

class SourceChecker:
def __init__(self, text, language, key, max_queries=8, span=8, threshold=0.7):
def __init__(self, text, max_queries=8, span=8):
self.text = text
self.language = language
self.key = key
self.max_queries = max_queries
self.span = span
self.threshold = threshold
self.cat_dict = defaultdict(list)
self.engine = Google(license=key, throttle=0.8, language=None)

def get_queries(self):
"""Extract search queries from the text."""
text = self.text.replace('--', 'DOUBLEDASH')
all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)

stop_words = stopwords.words(self.language) if self.language in stopwords.fileids() else []
"""Generate meaningful n-gram queries."""
words = word_tokenize(self.text)
queries = []

for ngram in all_ngrams:
stop_score = sum([w in stop_words for w in ngram]) / len(ngram)
ent_score = 0
for span in range(4, self.span + 1): # Start from 4-word phrases
for ngram in nltk_ngrams(words, n=span):
r_string = " ".join(ngram)
if len(r_string.split()) >= 4: # Minimum meaningful query length
queries.append(r_string)

if self.language == 'english':
chunked = ne_chunk(pos_tag(ngram))
named_entities = [chunk for chunk in chunked if isinstance(chunk, nltk.Tree)]
ent_score = len(named_entities) / len(ngram)
return list(dict.fromkeys(queries[:self.max_queries])) # Truncate to max_queries, then drop duplicates (order-preserving)

def search_google(self, query):
"""Search Google Custom Search API."""
url = "https://www.googleapis.com/customsearch/v1"
params = {
'key': API_KEY,
'cx': CSE_ID,
'q': query,
'num': 10,
}
try:
response = requests.get(url, params=params)
if response.status_code == 200:
return response.json().get('items', [])
logging.error(f"Google API error: {response.status_code} {response.text}")
return []
except Exception as e:
logging.error(f"Error during Google search: {e}")
return []

def render_output(self, domains):
"""Render results."""
output = defaultdict(list)

for domain, queries in domains.items():
overlap = len(queries) / self.max_queries
if overlap >= 0.6:
output['HIGH'].append(domain)
elif overlap >= 0.4:
output['SOME'].append(domain)
elif overlap >= 0.2:
output['MINIMAL'].append(domain)

return dict(output)

if stop_score < self.threshold and ent_score < self.threshold:
r_string = self.reconstruct_ngram(ngram)
if r_string in self.text:
queries.append(r_string)

reduction = len(queries) // self.max_queries
return queries[:len(queries):reduction] if reduction else queries

def reconstruct_ngram(self, ngram):
"""Reconstruct original substrings from the ngrams."""
punc_b = ['!', '?', '.', ',', ';', ':', '\'', ')', ']', '}']
punc_a = ['(', '[', '}', '$']
ngram = ' '.join(ngram)
for p in punc_b:
ngram = ngram.replace(' ' + p, p)
for p in punc_a:
ngram = ngram.replace(p + ' ', p)
ngram = re.sub('(^| )BEGQ', ' "', ngram)
ngram = re.sub('ENDQ($| )', '" ', ngram)
return ngram.replace('DOUBLEDASH', '--')

def load_domains(self):
"""Load domain information from CSV using pandas."""
sources_path = 'origin_api/static/data/news_websites.csv'
df = pd.read_csv(sources_path)
for index, row in df.iterrows():
url = row[2]
cats = "".join(str(row[3]))
self.cat_dict[url] = cats

def get_urls(self, queries):
"""Run search queries through Google API and collect returned domain information."""
@app.route('/pred', methods=['GET'])
def predict():
try:
text = request.args.get('text')
if not text:
return jsonify({"error": "No text provided"}), 400

sc = SourceChecker(text)
queries = sc.get_queries()

if not queries:
return jsonify({"error": "No valid queries generated"}), 400

domains = defaultdict(list)
for q in queries:
results = self.engine.search(f'"{q}"')
for query in queries:
results = sc.search_google(query)
for result in results:
domain = self.get_domain(result.url)
domains[domain].append(q)
return domains
link = result.get('link', '')
if link:
extracted = tldextract.extract(link)
domain = f"{extracted.domain}.{extracted.suffix}"
if domain:
domains[domain].append(query)

def get_domain(self, full_url):
"""Extract the domain name from the URL."""
clean_reg = re.compile(r'^((?:https?:\/\/)?(?:www\.)?).*?(\/.*)?$')
match = re.search(clean_reg, full_url)
return str.replace(str.replace(full_url, match.group(1), ''), match.group(2), '')
result = sc.render_output(domains)
return jsonify(result), 200

except Exception as e:
logging.error(f"Error in predict route: {e}")
return jsonify({"error": str(e)}), 500

def render_output(self, domains):
"""Render text output."""
output = defaultdict(list)
for d, v in domains.items():
d_cats = [c for c in self.cat_dict[d] if len(c) > 0 and len(c.split(' ')) < 3]
overlap = len(v) / self.max_queries
if 0.2 < overlap < 0.4:
output['MINIMAL'].append((d, "".join(d_cats)))
elif 0.4 < overlap < 0.6:
output['SOME'].append((d, "".join(d_cats)))
elif overlap >= 0.6:
output['HIGH'].append((d, "".join(d_cats)))

for deg in ['HIGH', 'SOME', 'MINIMAL']:
if output[deg]:
print(f'{deg} OVERLAP:')
for d, cats in sorted(output[deg]):
print(f'{d}: {cats if cats else ""}')
print('\n')

return output

if __name__ == '__main__':
app.run(debug=True, port=5000)
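
To make the new render_output thresholds concrete, a small worked example assuming the default max_queries=8: 5 of 8 queries is an overlap of 0.625 (HIGH), 4 of 8 is 0.5 (SOME), 2 of 8 is 0.25 (MINIMAL), and anything under 0.2 is dropped. The domains below are illustrative only:

from collections import defaultdict

# Illustrative input: domain -> queries that returned it (max_queries = 8).
domains = {
    "reuters.com": ["q1", "q2", "q3", "q4", "q5"],  # 5/8 = 0.625 -> HIGH
    "apnews.com": ["q1", "q2", "q3", "q4"],         # 4/8 = 0.500 -> SOME
    "example-blog.com": ["q1", "q2"],               # 2/8 = 0.250 -> MINIMAL
    "unrelated.com": ["q1"],                        # 1/8 = 0.125 -> dropped
}

max_queries = 8
output = defaultdict(list)
for domain, queries in domains.items():
    overlap = len(queries) / max_queries
    if overlap >= 0.6:
        output["HIGH"].append(domain)
    elif overlap >= 0.4:
        output["SOME"].append(domain)
    elif overlap >= 0.2:
        output["MINIMAL"].append(domain)

print(dict(output))
# {'HIGH': ['reuters.com'], 'SOME': ['apnews.com'], 'MINIMAL': ['example-blog.com']}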
7 changes: 6 additions & 1 deletion server/News_Origin/requirements.txt
@@ -4,4 +4,9 @@ flask-swagger-ui
nltk
pandas
python-dotenv
pattern
requests
beautifulsoup4
lxml
google-search
google-api-python-client==2.86.0
tldextract
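
Of the new dependencies, tldextract is what app.py now uses to normalize result URLs into registrable domains. A quick sketch of what it returns (the URL is illustrative; the first call may fetch the public suffix list over the network):

import tldextract

ext = tldextract.extract("https://www.bbc.co.uk/news/world-12345678")
print(ext.subdomain, ext.domain, ext.suffix)  # www bbc co.uk
print(f"{ext.domain}.{ext.suffix}")           # bbc.co.uk, the form app.py buckets on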