From 16d4ca76edc08c57509ac1d33aa597990d7eda23 Mon Sep 17 00:00:00 2001
From: Puppuccino <97849040+CrazyDubya@users.noreply.github.com>
Date: Wed, 26 Jun 2024 00:56:57 -0400
Subject: [PATCH] Create Claude-chat-gptarchive-public
Analyze your ChatGPT archive locally to get hard stats, save them, and then load those stats into a chat with Claude.
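
Typical flow (assuming the ChatGPT export JSON files live in BASE_DIR and
ANTHROPIC_API_KEY is exported): run the script, choose option 1 to analyze a
conversation in basic or advanced mode, read Claude's interpretation of the
saved stats, and type /bye to save the session; option 2 reloads a previous
analysis and resumes the chat.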
---
.../Claude-chat-gptarchive-public | 516 ++++++++++++++++++
1 file changed, 516 insertions(+)
create mode 100644 ChatGPTArchive/StatsplusClaude/Claude-chat-gptarchive-public
diff --git a/ChatGPTArchive/StatsplusClaude/Claude-chat-gptarchive-public b/ChatGPTArchive/StatsplusClaude/Claude-chat-gptarchive-public
new file mode 100644
index 0000000..aaf46a0
--- /dev/null
+++ b/ChatGPTArchive/StatsplusClaude/Claude-chat-gptarchive-public
@@ -0,0 +1,516 @@
+import csv
+import hashlib
+import json
+import os
+import re
+import string
+from collections import defaultdict, Counter
+from typing import Dict, List, Any, Tuple, Optional
+
+import networkx as nx
+import nltk
+import numpy as np
+from anthropic import Anthropic
+from nltk.corpus import stopwords
+from nltk.probability import FreqDist
+from nltk.sentiment import SentimentIntensityAnalyzer
+from nltk.tokenize import word_tokenize, sent_tokenize
+from sklearn.decomposition import LatentDirichletAllocation
+from sklearn.feature_extraction.text import TfidfVectorizer
+from textstat import flesch_reading_ease, flesch_kincaid_grade
+
+# Download necessary NLTK data
+nltk.download('punkt', quiet=True)
+nltk.download('stopwords', quiet=True)
+nltk.download('averaged_perceptron_tagger', quiet=True)
+nltk.download('vader_lexicon', quiet=True)
+nltk.download('maxent_ne_chunker', quiet=True)
+nltk.download('words', quiet=True)
+
+ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+if not ANTHROPIC_API_KEY:
+ raise ValueError("ANTHROPIC_API_KEY environment variable is not set")
+
+anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)
+
+BASE_DIR = 'WHERE/CHATGPT/JSON?STORED' # Update this path to the directory containing your ChatGPT export JSON files
+ANALYSIS_DIR = os.path.join(BASE_DIR, 'analysis_results') # Directory to store analysis results
+
+# Create analysis directory if it doesn't exist
+os.makedirs(ANALYSIS_DIR, exist_ok=True)
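+
+# Expected conversation layout (an assumption based on how the messages are read
+# below): each JSON file holds an object with a "messages" list, where every
+# message is a dict using either OpenAI-style "role"/"content" keys or the
+# archive's "user"/"ChatGPT" keys. Files shaped differently will surface as
+# "Unexpected message format" warnings or empty-analysis errors.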
+
+
+def preprocess_text(text: str) -> List[str]:
+ """Preprocess text by tokenizing, lowercasing, and removing punctuation and stopwords."""
+ tokens = word_tokenize(text.lower())
+ stop_words = set(stopwords.words('english'))
+ return [token for token in tokens if token not in stop_words and token not in string.punctuation]
+
+
+def extract_named_entities(text: str) -> Dict[str, List[str]]:
+ """Extract named entities from the text."""
+ chunks = nltk.ne_chunk(nltk.pos_tag(word_tokenize(text)))
+ entities = defaultdict(list)
+ for chunk in chunks:
+ if hasattr(chunk, 'label'):
+ entities[chunk.label()].append(' '.join(c[0] for c in chunk))
+ return dict(entities)
+
+
+def calculate_readability_scores(text: str) -> Dict[str, float]:
+ """Calculate readability scores for the text."""
+ return {
+ "flesch_reading_ease": flesch_reading_ease(text),
+ "flesch_kincaid_grade": flesch_kincaid_grade(text)
+ }
+
+
+def analyze_conversation_flow(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
+ """Analyze the flow of the conversation."""
+ turn_lengths = [len(word_tokenize(msg.get('content', ''))) for msg in messages if isinstance(msg, dict)]
+ return {
+ "avg_turn_length": np.mean(turn_lengths) if turn_lengths else 0,
+ "turn_length_variance": np.var(turn_lengths) if turn_lengths else 0,
+ "conversation_length": len(messages)
+ }
+
+
+def analyze_user_chatgpt_interaction(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
+ """Analyze the interaction between the user and ChatGPT."""
+ user_messages = []
+ chatgpt_messages = []
+
+ for msg in messages:
+ if isinstance(msg, dict):
+ if 'role' in msg:
+ if msg['role'] == 'user':
+ user_messages.append(msg)
+ elif msg['role'] == 'assistant':
+ chatgpt_messages.append(msg)
+ elif 'user' in msg:
+ user_messages.append(msg)
+ elif 'ChatGPT' in msg:
+ chatgpt_messages.append(msg)
+ else:
+ print(f"Unexpected message format: {msg}")
+
+ if not user_messages and not chatgpt_messages:
+ print("Warning: No user or ChatGPT messages found. Check the conversation structure.")
+ return {
+ "error": "Unable to analyze user-ChatGPT interaction due to unexpected conversation structure."
+ }
+
+ total_messages = len(user_messages) + len(chatgpt_messages)
+
+ return {
+ "user_message_count": len(user_messages),
+ "chatgpt_message_count": len(chatgpt_messages),
+ "user_message_ratio": len(user_messages) / total_messages if total_messages > 0 else 0,
+ "avg_user_message_length": np.mean(
+ [len(word_tokenize(msg.get('content', ''))) for msg in user_messages]) if user_messages else 0,
+ "avg_chatgpt_message_length": np.mean(
+ [len(word_tokenize(msg.get('content', ''))) for msg in chatgpt_messages]) if chatgpt_messages else 0,
+ "user_question_count": sum(1 for msg in user_messages if msg.get('content', '').strip().endswith('?')),
+ "chatgpt_code_block_count": sum(1 for msg in chatgpt_messages if '```' in msg.get('content', ''))
+ }
+
+
+def analyze_topic_shifts(messages: List[Dict[str, Any]]) -> List[Tuple[int, List[str]]]:
+ """Conduct a chronological analysis of topic shifts."""
+ vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
+ topic_shifts = []
+
+ for i, msg in enumerate(messages):
+ content = msg.get('content', '')
+ if not content.strip(): # Skip empty messages
+ continue
+ try:
+ tfidf_matrix = vectorizer.fit_transform([content])
+ feature_names = vectorizer.get_feature_names_out()
+ tfidf_scores = tfidf_matrix.toarray()[0]
+ top_topics = [feature_names[i] for i in tfidf_scores.argsort()[::-1][:5]]
+ topic_shifts.append((i, top_topics))
+ except ValueError:
+ # If vocabulary is empty, skip this message
+ continue
+
+ return topic_shifts
+
+
+def perform_co_occurrence_analysis(text: str, window_size: int = 5) -> nx.Graph:
+ """Perform co-occurrence analysis on key terms."""
+ words = preprocess_text(text)
+ co_occurrence = nx.Graph()
+
+ if len(words) < 2:
+ return co_occurrence # Return empty graph if not enough words
+
+ for i in range(len(words)):
+ for j in range(i + 1, min(i + window_size, len(words))):
+ if co_occurrence.has_edge(words[i], words[j]):
+ co_occurrence[words[i]][words[j]]['weight'] += 1
+ else:
+ co_occurrence.add_edge(words[i], words[j], weight=1)
+
+ return co_occurrence
+
+
+def topic_modeling(texts: List[str], num_topics: int = 5) -> Tuple[List[List[Tuple[str, float]]], np.ndarray]:
+ """Use LDA for topic modeling."""
+ if not texts:
+ return [], np.array([])
+
+ vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
+ try:
+ tfidf_matrix = vectorizer.fit_transform(texts)
+ except ValueError:
+ # If vocabulary is empty, return empty results
+ return [], np.array([])
+
+ if tfidf_matrix.shape[1] == 0:
+ return [], np.array([])
+
+ lda = LatentDirichletAllocation(n_components=min(num_topics, tfidf_matrix.shape[1]), random_state=42)
+ lda.fit(tfidf_matrix)
+
+ feature_names = vectorizer.get_feature_names_out()
+ topics = []
+ for topic_idx, topic in enumerate(lda.components_):
+ top_features_ind = topic.argsort()[:-10 - 1:-1] # indices of the 10 highest-weight terms for this topic
+ top_features = [(feature_names[i], topic[i]) for i in top_features_ind]
+ topics.append(top_features)
+
+ topic_distribution = lda.transform(tfidf_matrix)
+ return topics, topic_distribution
+
+
+def advanced_sentiment_analysis(text: str) -> Dict[str, float]:
+ """Perform advanced sentiment analysis on the text."""
+ sia = SentimentIntensityAnalyzer()
+ sentiment_scores = sia.polarity_scores(text)
+
+ # Additional analysis for frustration detection
+ frustration_keywords = ['error', 'bug', 'problem', 'issue', 'fail', 'crash', 'doesn\'t work', 'broken']
+ frustration_score = sum(text.lower().count(keyword) for keyword in frustration_keywords) / len(text.split())
+
+ sentiment_scores['frustration'] = frustration_score
+ return sentiment_scores
+
+
+def analyze_conversation_locally(conversation: Dict, mode: str = 'basic') -> Dict[str, Any]:
+ """Perform comprehensive local NLP analysis on the conversation."""
+ try:
+ messages = conversation.get('messages', [])
+ if not messages:
+ return {"error": "No messages found in the conversation."}
+
+ full_content = " ".join([msg.get('content', '') for msg in messages if isinstance(msg, dict)])
+
+ if not full_content.strip():
+ return {"error": "The conversation is empty or contains only whitespace."}
+
+ words = preprocess_text(full_content)
+ sentences = sent_tokenize(full_content)
+
+ if not words:
+ return {"error": "No meaningful words found in the conversation after preprocessing."}
+
+ basic_analysis = {
+ "word_count": len(words),
+ "sentence_count": len(sentences),
+ "top_words": FreqDist(words).most_common(10),
+ "pos_distribution": dict(Counter(tag for word, tag in nltk.pos_tag(words))),
+ "sentiment_analysis": advanced_sentiment_analysis(full_content),
+ "lexical_diversity": len(set(words)) / len(words) if words else 0,
+ "avg_sentence_length": np.mean([len(word_tokenize(sentence)) for sentence in sentences]) if sentences else 0
+ }
+
+ if mode == 'advanced':
+ advanced_analysis = {
+ "topic_shifts": analyze_topic_shifts(messages),
+ "co_occurrence_graph": list(perform_co_occurrence_analysis(full_content).edges(data=True)),
+ "topics": topic_modeling([msg.get('content', '') for msg in messages if isinstance(msg, dict)])[0],
+ "syntactic_complexity": len(words) / len(sentences) if sentences else None,
+ "named_entities": extract_named_entities(full_content),
+ "readability_scores": calculate_readability_scores(full_content),
+ "conversation_flow": analyze_conversation_flow(messages),
+ "user_chatgpt_interaction": analyze_user_chatgpt_interaction(messages)
+ }
+ return {**basic_analysis, **advanced_analysis}
+
+ return basic_analysis
+ except Exception as e:
+ print(f"Error in analyze_conversation_locally: {str(e)}")
+ return {"error": f"An error occurred during analysis: {str(e)}"}
+
+
+def get_file_hash(file_path: str) -> str:
+ """Generate a hash for the file content."""
+ with open(file_path, 'rb') as file:
+ return hashlib.md5(file.read()).hexdigest()
+
+
+def save_analysis_results(file_path: str, local_analysis: Dict, claude_conversation: List[Dict], mode: str):
+ """Save both local and Claude's analysis results."""
+ file_hash = get_file_hash(file_path)
+ analysis_file = os.path.join(ANALYSIS_DIR, f"{file_hash}_analysis.json")
+
+ results = {
+ "original_file": os.path.basename(file_path),
+ "mode": mode,
+ "local_analysis": local_analysis,
+ "claude_conversation": claude_conversation
+ }
+
+ with open(analysis_file, 'w', encoding='utf-8') as f:
+ json.dump(results, f, indent=2)
+
+
+def load_analysis_results(file_path: str) -> Optional[Dict]:
+ """Load existing analysis results if available."""
+ file_hash = get_file_hash(file_path)
+ analysis_file = os.path.join(ANALYSIS_DIR, f"{file_hash}_analysis.json")
+
+ if os.path.exists(analysis_file):
+ with open(analysis_file, 'r', encoding='utf-8') as f:
+ return json.load(f)
+ return None
+
+
+def extract_and_save_requests(response: str, file_path: str) -> None:
+ """Extract requests for further analysis from Claude's response and save them to a CSV file."""
+ # NOTE: the tag name below is an assumption; it must match whatever markup the
+ # prompt asks Claude to wrap follow-up analysis requests in.
+ requests = re.findall(r'<request>(.*?)</request>', response, re.DOTALL)
+
+ if not requests:
+ print("No requests for additional analysis found.")
+ return
+
+ csv_file = os.path.join(ANALYSIS_DIR, 'claude_requests.csv')
+ file_exists = os.path.exists(csv_file)
+
+ with open(csv_file, 'a', newline='', encoding='utf-8') as f:
+ writer = csv.writer(f)
+ if not file_exists:
+ writer.writerow(['Conversation', 'Request', 'Category', 'Priority'])
+
+ for req in requests:
+ category, priority = categorize_request(req)
+ writer.writerow([os.path.basename(file_path), req.strip(), category, priority])
+
+ print(f"Saved {len(requests)} requests for further analysis.")
+
+
+def categorize_request(request: str) -> Tuple[str, str]:
+ """Categorize the request and assign a priority."""
+ categories = {
+ 'statistical': ['correlation', 'regression', 'distribution', 'hypothesis', 'test', 'variance'],
+ 'linguistic': ['syntax', 'semantic', 'discourse', 'pragmatic', 'lexical'],
+ 'semantic': ['topic', 'concept', 'entity', 'relation', 'ontology'],
+ 'interaction': ['turn-taking', 'response time', 'engagement', 'clarification'],
+ 'technical': ['code', 'algorithm', 'framework', 'library', 'api']
+ }
+
+ request_lower = request.lower()
+ for category, keywords in categories.items():
+ if any(keyword in request_lower for keyword in keywords):
+ return category, 'high'
+
+ # If no specific category is found, use 'general' with medium priority
+ return 'general', 'medium'
+
+
+def format_analysis_summary(analysis: Dict, mode: str) -> str:
+ """Format the analysis summary for Claude's input."""
+ summary = f"""
+ Analysis Mode: {mode}
+ Word Count: {analysis['word_count']}
+ Sentence Count: {analysis['sentence_count']}
+ Top Words: {', '.join([f"{word}({count})" for word, count in analysis['top_words']])}
+ Parts of Speech Distribution: {analysis['pos_distribution']}
+ Lexical Diversity: {analysis['lexical_diversity']:.2f}
+ Average Sentence Length: {analysis['avg_sentence_length']:.2f}
+ Sentiment Analysis:
+ {json.dumps(analysis['sentiment_analysis'], indent=2)}
+ """
+
+ if mode == 'advanced':
+ summary += f"""
+ Top 3 Topics:
+ {json.dumps(analysis['topics'][:3], indent=2)}
+ Top 10 Co-occurrences:
+ {json.dumps(analysis['co_occurrence_graph'][:10], indent=2)}
+ Named Entities:
+ {json.dumps(analysis['named_entities'], indent=2)}
+ Readability Scores:
+ {json.dumps(analysis['readability_scores'], indent=2)}
+ Conversation Flow:
+ {json.dumps(analysis['conversation_flow'], indent=2)}
+ User-ChatGPT Interaction:
+ {json.dumps(analysis['user_chatgpt_interaction'], indent=2)}
+ """
+
+ return summary
+
+
+def analyze_conversation(file_path: str, conversation: Dict, mode: str = 'basic') -> None:
+ """Perform deep analysis of the conversation using local NLP and Claude."""
+ existing_analysis = load_analysis_results(file_path)
+
+ if existing_analysis:
+ if 'mode' in existing_analysis and existing_analysis['mode'] == mode:
+ print("Loading existing analysis results...")
+ local_analysis = existing_analysis.get('local_analysis', {})
+ claude_conversation = existing_analysis.get('claude_conversation', [])
+ else:
+ print(f"Existing analysis found, but mode doesn't match. Performing new {mode} analysis...")
+ local_analysis = analyze_conversation_locally(conversation, mode)
+ claude_conversation = []
+ else:
+ print(f"Performing new {mode} analysis...")
+ local_analysis = analyze_conversation_locally(conversation, mode)
+ claude_conversation = []
+
+ if "error" in local_analysis:
+ print(f"Error in local analysis: {local_analysis['error']}")
+ return
+
+ if not claude_conversation:
+ analysis_summary = format_analysis_summary(local_analysis, mode)
+
+ initial_prompt = f"""
+ Based on the following {mode} NLP analysis of a technical conversation between a user and ChatGPT, please provide deep insights and interpretations:
+
+ {analysis_summary}
+
+ Please consider:
+ 1. How do the topic shifts and conversation flow reflect the structure of the technical discussion?
+ 2. What insights can we draw from the co-occurrence analysis and named entities about the relationships between technical concepts?
+ 3. How does the sentiment analysis and user-ChatGPT interaction metrics reflect the participants' experiences with the technical content?
+ 4. What hidden themes or concepts are revealed by the topic modeling that might not be immediately apparent?
+ 5. How do the readability scores, syntactic complexity, and lexical diversity reflect the level of expertise in the conversation?
+ 6. Based on this analysis, what can we infer about the nature, depth, and context of this technical discussion?
+ 7. Are there any patterns or anomalies in the data that warrant further investigation?
+ 8. How does the conversation demonstrate the capabilities and limitations of ChatGPT in handling technical discussions?
+ 9. What recommendations would you make for improving the quality of such technical conversations between users and AI assistants?
+
+ Provide a detailed interpretation of these results, highlighting key insights and potential areas for further analysis. Feel free to use <thinking> tags to work through your reasoning, and wrap any requests for additional analysis in <request></request> tags.
+
+ """
+ claude_conversation.append({"role": "user", "content": initial_prompt})
+
+ while True:
+ if claude_conversation:
+ try:
+ message = anthropic.messages.create(
+ model="claude-3-5-sonnet-20240620",
+ max_tokens=4000,
+ messages=claude_conversation
+ )
+ response = message.content[0].text
+ claude_conversation.append({"role": "assistant", "content": response})
+ print("\nClaude's Analysis:")
+ print(response)
+
+ # Extract and save requests for further analysis
+ extract_and_save_requests(response, file_path)
+ except Exception as e:
+ print(f"Error in getting Claude's response: {e}")
+ break
+
+ user_input = input("\nEnter your next question (or '/bye' to return to the main menu): ")
+ if user_input.lower() == '/bye':
+ print("Returning to the main menu...")
+ break
+
+ claude_conversation.append({"role": "user", "content": user_input})
+ save_analysis_results(file_path, local_analysis, claude_conversation, mode)
+
+ # Final save before returning to the main menu
+ save_analysis_results(file_path, local_analysis, claude_conversation, mode)
+
+
+def load_conversations(directory: str) -> Dict[str, str]:
+ """Load conversations from JSON files in the specified directory."""
+ conversations = {}
+ # Number only the JSON files so the menu numbering has no gaps
+ json_files = sorted(f for f in os.listdir(directory) if f.endswith('.json'))
+ for i, filename in enumerate(json_files, 1):
+ conversations[str(i)] = os.path.join(directory, filename)
+ return conversations
+
+
+def load_conversation_content(file_path: str) -> Dict:
+ """Load the content of a single conversation from a JSON file."""
+ with open(file_path, 'r', encoding='utf-8') as file:
+ return json.load(file)
+
+
+def list_saved_analyses():
+ """List all saved analyses."""
+ analyses = []
+ for filename in os.listdir(ANALYSIS_DIR):
+ if filename.endswith('_analysis.json'):
+ with open(os.path.join(ANALYSIS_DIR, filename), 'r') as f:
+ data = json.load(f)
+ analyses.append(data['original_file'])
+ return analyses
+
+
+def main():
+ while True:
+ print("\n1. Analyze a new conversation")
+ print("2. Load a previous analysis")
+ print("3. Exit")
+ choice = input("Enter your choice (1/2/3): ")
+
+ if choice == '1':
+ conversations = load_conversations(BASE_DIR)
+ print("\nAvailable conversations:")
+ for num, file_path in conversations.items():
+ print(f"{num}: {os.path.basename(file_path)}")
+
+ conv_choice = input("\nEnter the number of the conversation you want to analyze: ")
+ if conv_choice in conversations:
+ file_path = conversations[conv_choice]
+ conversation = load_conversation_content(file_path)
+
+ analysis_mode = input("Choose analysis mode (basic/advanced): ").lower()
+ if analysis_mode not in ['basic', 'advanced']:
+ print("Invalid mode. Defaulting to basic analysis.")
+ analysis_mode = 'basic'
+
+ analyze_conversation(file_path, conversation, analysis_mode)
+ else:
+ print("Invalid choice. Please try again.")
+
+ elif choice == '2':
+ saved_analyses = list_saved_analyses()
+ if not saved_analyses:
+ print("No saved analyses found.")
+ continue
+
+ print("\nSaved analyses:")
+ for i, filename in enumerate(saved_analyses, 1):
+ print(f"{i}: {filename}")
+
+ analysis_choice = input("\nEnter the number of the analysis you want to load: ")
+ try:
+ file_to_load = saved_analyses[int(analysis_choice) - 1]
+ file_path = os.path.join(BASE_DIR, file_to_load)
+ conversation = load_conversation_content(file_path)
+ existing_analysis = load_analysis_results(file_path)
+ mode = existing_analysis.get('mode', 'basic') if existing_analysis else 'basic'
+ analyze_conversation(file_path, conversation, mode)
+ except (ValueError, IndexError):
+ print("Invalid choice. Please try again.")
+
+ elif choice == '3':
+ print("Goodbye!")
+ break
+
+ else:
+ print("Invalid choice. Please try again.")
+
+
+if __name__ == "__main__":
+ main()