Updated and better: News-Origin #65

Open · wants to merge 2 commits into master
71 changes: 51 additions & 20 deletions NewClient/src/popup/App.tsx
@@ -6,9 +6,9 @@ import { Button } from "@/components/ui/button";
import { Checkbox } from "@/components/ui/checkbox";

type NewsOriginResult = {
HIGH?: [string, string][];
MINIMAL?: [string, string][];
SOME?: [string, string][];
HIGH?: string[];
MINIMAL?: string[];
SOME?: string[];
error?: string;
};

@@ -89,22 +89,53 @@ function App() {
{newsOriginResult && !newsOriginResult.error && (
<div className="mt-2">
<h4>News Origin Results:</h4>
{["HIGH", "MINIMAL", "SOME"].map((category) => (
newsOriginResult[category as keyof NewsOriginResult] && newsOriginResult[category as keyof NewsOriginResult]!.length > 0 && (
<div key={category} className="mb-2">
<h5>{category} Probability:</h5>
<ul className="list-disc ml-5">
{newsOriginResult[category as keyof NewsOriginResult]!.map(([url, description], index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url} {description && `- ${description}`}
</a>
</li>
))}
</ul>
</div>
)
))}

{/* Cleanly display the categories */}
{newsOriginResult.HIGH && newsOriginResult.HIGH.length > 0 && (
<div className="mb-2">
<strong>HIGH:</strong>
<ul className="list-disc ml-5">
{newsOriginResult.HIGH.map((url, index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url}
</a>
</li>
))}
</ul>
</div>
)}

{newsOriginResult.MINIMAL && newsOriginResult.MINIMAL.length > 0 && (
<div className="mb-2">
<strong>MINIMAL:</strong>
<ul className="list-disc ml-5">
{newsOriginResult.MINIMAL.map((url, index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url}
</a>
</li>
))}
</ul>
</div>
)}

{newsOriginResult.SOME && newsOriginResult.SOME.length > 0 && (
<div className="mb-2">
<strong>SOME:</strong>
<ul className="list-disc ml-5">
{newsOriginResult.SOME.map((url, index) => (
<li key={index}>
<a href={`https://${url}`} target="_blank" rel="noopener noreferrer">
{url}
</a>
</li>
))}
</ul>
</div>
)}

<Button variant="destructive" onClick={handleDeleteNewsOrigin}>
Delete News Origin Results
</Button>
@@ -150,4 +181,4 @@ function App() {
);
}

export default App;
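
Note: the popup now expects each category to be a flat list of domain strings instead of [url, description] tuples. For reference, a minimal sketch of a call against the updated service and the payload shape the component assumes — the host/port and the example domains are assumptions, not part of this PR; only the /pred route and the NewsOriginResult fields come from the diff:

import requests

# Hypothetical local call; the actual host/port depend on the compose setup.
resp = requests.get(
    "http://localhost:5009/pred",
    params={"text": "Some article text to trace back to its origin."},
)
resp.raise_for_status()
result = resp.json()

# Expected shape, matching NewsOriginResult: flat lists of domain strings,
# e.g. {"HIGH": ["reuters.com"], "SOME": ["apnews.com"], "MINIMAL": []}
for category in ("HIGH", "SOME", "MINIMAL"):
    for domain in result.get(category, []):
        print(category, domain)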
7 changes: 5 additions & 2 deletions server/Dockerfile
@@ -17,6 +17,7 @@ COPY fakeNews/predictions/requirements.txt /app/fakeNews/predictions/requirements.txt
COPY imageAPI/requirements.txt /app/imageAPI/requirements.txt
COPY ReportAPI/requirements.txt /app/ReportAPI/requirements.txt
COPY Summarizer/requirements.txt /app/Summarizer/requirements.txt
COPY News_Origin/requirements.txt /app/News_Origin/requirements.txt

RUN pip install --no-cache-dir -r /app/Hate_Speech/requirements.txt && \
pip install --no-cache-dir -r /app/Click-Bait/requirements.txt && \
@@ -26,6 +27,7 @@ RUN pip install --no-cache-dir -r /app/Hate_Speech/requirements.txt && \
pip install --no-cache-dir -r /app/fakeNews/predictions/requirements.txt && \
pip install --no-cache-dir -r /app/imageAPI/requirements.txt && \
pip install --no-cache-dir -r /app/ReportAPI/requirements.txt && \
pip install --no-cache-dir -r /app/News_Origin/requirements.txt && \
pip install --no-cache-dir -r /app/Summarizer/requirements.txt

# Copy all applications' code into the container
@@ -38,9 +40,10 @@ COPY fakeNews /app/fakeNews/predictions
COPY imageAPI /app/imageAPI
COPY ReportAPI /app/ReportAPI
COPY Summarizer /app/Summarizer
COPY News_Origin /app/News_Origin

# Expose ports for Flask services
EXPOSE 5001 5002 5003 5004 5005 5006 5007 5008
EXPOSE 5001 5002 5003 5004 5005 5006 5007 5008 5009

# Default command is to start the app (can be overridden by Docker Compose)
CMD ["python", "app.py"]
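
Note: EXPOSE 5009 only documents the port; the app.py in this PR still defaults to app.run(debug=True, port=5000), so the Docker Compose command would need to bind the News_Origin service to 5009 explicitly. A minimal sketch of such an override (the host/port values are an assumption, not taken from this diff):

# Hypothetical entrypoint override in News_Origin/app.py so the service
# actually listens on the port the Dockerfile exposes.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5009, debug=False)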
203 changes: 86 additions & 117 deletions server/News_Origin/app.py
@@ -3,165 +3,134 @@
from flask_cors import CORS
from flask_swagger_ui import get_swaggerui_blueprint
import nltk
import re
from collections import defaultdict
from pattern.en import ngrams
from pattern.web import Google
from nltk.util import ngrams as nltk_ngrams
from nltk.corpus import stopwords
from nltk import ne_chunk, pos_tag
import pandas as pd
from nltk.tokenize import word_tokenize
import requests
from dotenv import load_dotenv
import pandas as pd
from collections import defaultdict
import re
import tldextract
import logging
# Download required NLTK data
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('stopwords')
nltk.download('words')

# Load environment variables from .env file
# Load environment variables
load_dotenv()

# Environment variables
NLTK_DATA_PATH = os.getenv('NLTK_DATA_PATH')
API_KEY = os.getenv('API_KEY')

# Set NLTK data path
# nltk.data.path = [NLTK_DATA_PATH]

# Flask setup
app = Flask(__name__)
app.config['TESTING'] = True
app.debug = True

# Swagger configuration
# Swagger setup
SWAGGER_URL = '/swagger'
API_URL = '/static/swagger.json'
SWAGGERUI_BLUEPRINT = get_swaggerui_blueprint(
SWAGGER_URL,
API_URL,
config={'app_name': "Social Street Smart - News Origin"}
SWAGGER_URL, API_URL, config={'app_name': "Social Street Smart - News Origin"}
)
app.register_blueprint(SWAGGERUI_BLUEPRINT, url_prefix=SWAGGER_URL)

# CORS configuration
CORS(app)

@app.route('/')
def hello_world():
return 'Hello, World!'

@app.route('/pred', methods=['GET', 'POST'])
def predict():
text = request.args.get('text') if request.method == 'GET' else request.form.get('text')
key = request.args.get('key', API_KEY)

if not key or len(key) != 39:
key = API_KEY

sc = SourceChecker(text, 'english', key)
queries = sc.get_queries()
domains = sc.get_urls(queries)
sc.load_domains()
result = sc.render_output(domains)
# Environment variables
API_KEY = os.getenv("API_KEY") # Replace with ur Key from https://developers.google.com/custom-search/v1/introduction
CSE_ID = os.getenv("CSE_ID") # Replace with your valid CSE ID from https://programmablesearchengine.google.com/controlpanel/all

return jsonify(result), 200

class SourceChecker:
def __init__(self, text, language, key, max_queries=8, span=8, threshold=0.7):
def __init__(self, text, max_queries=8, span=8):
self.text = text
self.language = language
self.key = key
self.max_queries = max_queries
self.span = span
self.threshold = threshold
self.cat_dict = defaultdict(list)
self.engine = Google(license=key, throttle=0.8, language=None)

def get_queries(self):
"""Extract search queries from the text."""
text = self.text.replace('--', 'DOUBLEDASH')
all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)

stop_words = stopwords.words(self.language) if self.language in stopwords.fileids() else []
"""Generate meaningful n-gram queries."""
words = word_tokenize(self.text)
queries = []

for ngram in all_ngrams:
stop_score = sum([w in stop_words for w in ngram]) / len(ngram)
ent_score = 0
for span in range(4, self.span + 1): # Start from 4-word phrases
for ngram in nltk_ngrams(words, n=span):
r_string = " ".join(ngram)
if len(r_string.split()) >= 4: # Minimum meaningful query length
queries.append(r_string)

if self.language == 'english':
chunked = ne_chunk(pos_tag(ngram))
named_entities = [chunk for chunk in chunked if isinstance(chunk, nltk.Tree)]
ent_score = len(named_entities) / len(ngram)
return list(dict.fromkeys(queries[:self.max_queries])) # Truncate to max_queries, then drop duplicates (order-preserving)

def search_google(self, query):
"""Search Google Custom Search API."""
url = "https://www.googleapis.com/customsearch/v1"
params = {
'key': API_KEY,
'cx': CSE_ID,
'q': query,
'num': 10,
}
try:
response = requests.get(url, params=params)
if response.status_code == 200:
return response.json().get('items', [])
logging.error(f"Google API error: {response.status_code} {response.text}")
return []
except Exception as e:
logging.error(f"Error during Google search: {e}")
return []

def render_output(self, domains):
"""Render results."""
output = defaultdict(list)

for domain, queries in domains.items():
overlap = len(queries) / self.max_queries
if overlap >= 0.6:
output['HIGH'].append(domain)
elif overlap >= 0.4:
output['SOME'].append(domain)
elif overlap >= 0.2:
output['MINIMAL'].append(domain)

return dict(output)

if stop_score < self.threshold and ent_score < self.threshold:
r_string = self.reconstruct_ngram(ngram)
if r_string in self.text:
queries.append(r_string)

reduction = len(queries) // self.max_queries
return queries[:len(queries):reduction] if reduction else queries

def reconstruct_ngram(self, ngram):
"""Reconstruct original substrings from the ngrams."""
punc_b = ['!', '?', '.', ',', ';', ':', '\'', ')', ']', '}']
punc_a = ['(', '[', '}', '$']
ngram = ' '.join(ngram)
for p in punc_b:
ngram = ngram.replace(' ' + p, p)
for p in punc_a:
ngram = ngram.replace(p + ' ', p)
ngram = re.sub('(^| )BEGQ', ' "', ngram)
ngram = re.sub('ENDQ($| )', '" ', ngram)
return ngram.replace('DOUBLEDASH', '--')

def load_domains(self):
"""Load domain information from CSV using pandas."""
sources_path = 'origin_api/static/data/news_websites.csv'
df = pd.read_csv(sources_path)
for index, row in df.iterrows():
url = row[2]
cats = "".join(str(row[3]))
self.cat_dict[url] = cats

def get_urls(self, queries):
"""Run search queries through Google API and collect returned domain information."""
@app.route('/pred', methods=['GET'])
def predict():
try:
text = request.args.get('text')
if not text:
return jsonify({"error": "No text provided"}), 400

sc = SourceChecker(text)
queries = sc.get_queries()

if not queries:
return jsonify({"error": "No valid queries generated"}), 400

domains = defaultdict(list)
for q in queries:
results = self.engine.search(f'"{q}"')
for query in queries:
results = sc.search_google(query)
for result in results:
domain = self.get_domain(result.url)
domains[domain].append(q)
return domains
link = result.get('link', '')
if link:
extracted = tldextract.extract(link)
domain = f"{extracted.domain}.{extracted.suffix}"
if domain:
domains[domain].append(query)

def get_domain(self, full_url):
"""Extract the domain name from the URL."""
clean_reg = re.compile(r'^((?:https?:\/\/)?(?:www\.)?).*?(\/.*)?$')
match = re.search(clean_reg, full_url)
return str.replace(str.replace(full_url, match.group(1), ''), match.group(2), '')
result = sc.render_output(domains)
return jsonify(result), 200

except Exception as e:
logging.error(f"Error in predict route: {e}")
return jsonify({"error": str(e)}), 500

def render_output(self, domains):
"""Render text output."""
output = defaultdict(list)
for d, v in domains.items():
d_cats = [c for c in self.cat_dict[d] if len(c) > 0 and len(c.split(' ')) < 3]
overlap = len(v) / self.max_queries
if 0.2 < overlap < 0.4:
output['MINIMAL'].append((d, "".join(d_cats)))
elif 0.4 < overlap < 0.6:
output['SOME'].append((d, "".join(d_cats)))
elif overlap >= 0.6:
output['HIGH'].append((d, "".join(d_cats)))

for deg in ['HIGH', 'SOME', 'MINIMAL']:
if output[deg]:
print(f'{deg} OVERLAP:')
for d, cats in sorted(output[deg]):
print(f'{d}: {cats if cats else ""}')
print('\n')

return output

if __name__ == '__main__':
app.run(debug=True, port=5000)
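
To make the new render_output thresholds concrete, a small worked example assuming the default max_queries=8: 5 of 8 queries is an overlap of 0.625 (HIGH), 4 of 8 is 0.5 (SOME), 2 of 8 is 0.25 (MINIMAL), and anything under 0.2 is dropped. The domains below are illustrative only:

from collections import defaultdict

# Illustrative input: domain -> queries that returned it (max_queries = 8).
domains = {
    "reuters.com": ["q1", "q2", "q3", "q4", "q5"],  # 5/8 = 0.625 -> HIGH
    "apnews.com": ["q1", "q2", "q3", "q4"],         # 4/8 = 0.500 -> SOME
    "example-blog.com": ["q1", "q2"],               # 2/8 = 0.250 -> MINIMAL
    "unrelated.com": ["q1"],                        # 1/8 = 0.125 -> dropped
}

max_queries = 8
output = defaultdict(list)
for domain, queries in domains.items():
    overlap = len(queries) / max_queries
    if overlap >= 0.6:
        output["HIGH"].append(domain)
    elif overlap >= 0.4:
        output["SOME"].append(domain)
    elif overlap >= 0.2:
        output["MINIMAL"].append(domain)

print(dict(output))
# {'HIGH': ['reuters.com'], 'SOME': ['apnews.com'], 'MINIMAL': ['example-blog.com']}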
7 changes: 6 additions & 1 deletion server/News_Origin/requirements.txt
@@ -4,4 +4,9 @@ flask-swagger-ui
nltk
pandas
python-dotenv
pattern
requests
beautifulsoup4
lxml
google-search
google-api-python-client==2.86.0
tldextract
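
Of the new dependencies, tldextract is what app.py now uses to normalize result URLs into registrable domains. A quick sketch of what it returns (the URL is illustrative; the first call may fetch the public suffix list over the network):

import tldextract

ext = tldextract.extract("https://www.bbc.co.uk/news/world-12345678")
print(ext.subdomain, ext.domain, ext.suffix)  # www bbc co.uk
print(f"{ext.domain}.{ext.suffix}")           # bbc.co.uk, the form app.py buckets on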