Bugfix/ingest nested folders #13

Open
wants to merge 9 commits into base: main
61 changes: 35 additions & 26 deletions frontend/pages/index.tsx
@@ -1,9 +1,9 @@
"use client"; // This is a client component
"use client";

import React, { useEffect, useState, useContext } from 'react';
import React, { useEffect, useState } from 'react';
import PieChart from '../components/PieChart';
import { getGateUrl } from '../utils/getGateUrl';
import { fetchUser, fetchReadItemsByAttribute, fetchItems } from '../utils/api'
import { fetchUser, fetchReadItemsByAttribute, fetchItems } from '../utils/api';

interface Result {
answer: string;
@@ -21,13 +21,26 @@ interface Criterion {
gate: string;
}

// Map of gate identifiers to their human-readable format
const gateMap: { [key: string]: string } = {
'GATE_0': 'Gate 0',
'GATE_1': 'Gate 1',
'GATE_2': 'Gate 2',
'GATE_3': 'Gate 3',
'GATE_4': 'Gate 4',
'GATE_5': 'Gate 5'
};

const getBaseName = (name: string): string => name.split('-')[0];

const Summary: React.FC = () => {
const [chartData, setChartData] = useState<number[]>([]);
const [chartLabels, setChartLabels] = useState<string[]>([]);
const [summaryText, setSummaryText] = useState<string>('');
const [gateUrl, setGateUrl] = useState<string | null>(null);
const [categories, setCategories] = useState<{ [key: string]: number }>({});
const [projectDetails, setProjectDetails] = useState<any>(null);
const [reviewType, setReviewType] = useState<string>('');

useEffect(() => {
const fetchData = async () => {
@@ -44,6 +57,15 @@ const Summary: React.FC = () => {
};

const criteria = await Promise.all(results.map(fetchCriteria));

// Calculate review type based on unique gates
const uniqueGates = new Set(criteria.map(criterion => criterion.gate));
const formattedGates = Array.from(uniqueGates)
.map(gate => gateMap[gate] || gate)
.sort()
.join(', ');
setReviewType(formattedGates);

const fetchedCategories = criteria.map(criterion => criterion.category);
console.log('Criterion fetched:', fetchedCategories);

@@ -88,43 +110,30 @@ const Summary: React.FC = () => {
<br />
<div className="summary-card" style={{ display: 'flex', alignItems: 'center', marginBottom: '20px', marginTop: '40px' }}>
<div style={{ flex: 1 }}>
<h2>Welcome to <strong>Scout!</strong></h2>
<p>
{gateUrl && (
<>
This AI tool helps you navigate your document set before your review. Please check the details below are correct before continuing
<ul>
<strong>Review Type:</strong> {projectDetails.review_type} <br />
<strong>Project Name:</strong> {projectDetails.name}
</ul>
This tool has preprocessed your documents and analysed them against the questions in the
<a href={gateUrl} target="_blank" rel="noopener noreferrer">
{projectDetails.review_type} workbook
</a>.
</>
)}
Scout helps you navigate your document set before your review. Please check that the details below are correct before continuing.
<ul>
Project Name:<strong> {projectDetails.name ? getBaseName(projectDetails.name) : projectDetails.name}</strong><br />
Review Type:<strong> {reviewType} </strong>

</ul>
This tool has preprocessed your documents and analysed them against the questions in the {reviewType} workbook. Head to the Results tab to see the analysis for each criterion in the gate workbook, and click the file links to see what evidence was used to reach each conclusion.
</p>
</div>
</div>
</div>

<div className="summary-card" style={{ display: 'flex', alignItems: 'top', marginBottom: '20px' }}>
<div style={{ flex: 1 }}>
<h2>Review Summary</h2>
<p>{summaryText}</p>
<h2>Overview</h2>
<div dangerouslySetInnerHTML={{ __html: summaryText }} />
</div>
<div className="chart-container" style={{ flex: 1 }}>
<PieChart data={chartData} labels={chartLabels} />
</div>
</div>
{Object.keys(categories).map(category => (
<div className="summary-card" key={category} style={{ marginBottom: '20px' }}>
<h2>{category}</h2>
<p>{`Number of negative results: ${categories[category]}`}</p>
</div>
))}
</div>
);
};

export default Summary;
export default Summary;
57 changes: 36 additions & 21 deletions libreoffice_service/app.py
@@ -39,27 +39,42 @@ class HealthResponse(BaseModel):
status: bool


def transform_file_path(input_path: str) -> str:
# Split the path into directory and filename
directory, filename = os.path.split(input_path)

# Split the directory into parts
dir_parts = directory.split(os.sep)

# Remove the last directory and replace it with "processed"
if len(dir_parts) > 1:
dir_parts[-1] = "processed"
else:
dir_parts.append("processed")

# Join the directory parts back together
new_directory = os.sep.join(dir_parts)

# Get the filename without extension and add .pdf
new_filename = os.path.splitext(filename)[0] + ".pdf"

# Join the new directory and filename
return os.path.join(new_directory, new_filename)
def transform_file_path(input_path: str) -> str:
"""
Transform a file path by replacing 'raw' directory with 'processed' and changing extension to .pdf
while maintaining the rest of the directory structure.

Args:
input_path (str): Original file path (e.g., 'project/raw/subfolder/file.docx')

Returns:
str: Transformed file path (e.g., 'project/processed/subfolder/file.pdf')

Example:
>>> transform_file_path('example_project/raw/test_folder/document.docx')
'example_project/processed/test_folder/document.pdf'
"""
# Normalize path separators for the current OS
input_path = os.path.normpath(input_path)

# Split the path into parts
parts = input_path.split(os.sep)

# Find the 'raw' directory index
try:
raw_index = parts.index('raw')
except ValueError:
raise ValueError("Input path must contain a 'raw' directory")

# Replace 'raw' with 'processed'
parts[raw_index] = 'processed'

# Get the filename and change extension to .pdf
filename = os.path.splitext(parts[-1])[0] + '.pdf'
parts[-1] = filename

# Join all parts back together
return os.sep.join(parts)
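
# --- Illustrative sketch, not part of this diff: how the new transform behaves. ---
# Assumes POSIX-style separators; the sample paths below are hypothetical.
assert transform_file_path("example_project/raw/test_folder/document.docx") == \
    "example_project/processed/test_folder/document.pdf"

# Nested subfolders are preserved, which is the point of this bugfix:
assert transform_file_path("proj/raw/a/b/report.xlsx") == "proj/processed/a/b/report.pdf"

# A path without a 'raw' directory now raises instead of silently renaming its parent folder:
try:
    transform_file_path("proj/misc/file.docx")
except ValueError as exc:
    print(exc)  # Input path must contain a 'raw' directory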


@app.get("/health")
24 changes: 15 additions & 9 deletions scout/LLMFlag/evaluation.py
@@ -8,7 +8,7 @@
from openai import APIConnectionError, APIError, OpenAI, RateLimitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from scout.DataIngest.models.schemas import Chunk, CriterionCreate, File, ProjectCreate, ResultCreate
from scout.DataIngest.models.schemas import Chunk, CriterionCreate, File, ProjectCreate, ProjectUpdate, ResultCreate
from scout.LLMFlag.prompts import (
CORE_SCOUT_PERSONA,
DOCUMENT_EXTRACT_PROMPT,
@@ -19,6 +19,7 @@
USER_EVIDENCE_POINTS_PROMPT,
USER_QUESTION_PROMPT,
USER_REGENERATE_HYPOTHESIS_PROMPT,
SUMMARIZE_RESPONSES_PROMPT,
)
from scout.LLMFlag.retriever import ReRankRetriever
from scout.utils.storage.storage_handler import BaseStorageHandler
@@ -49,6 +50,7 @@ class BaseEvaluator(ABC):
def __init__(self):
"""Initialise the evaluator"""
self.hypotheses = "None"
self.temp = 0

@abstractmethod
def evaluate_question(self, criteria_uuid: str) -> List[str]:
@@ -112,6 +114,7 @@ def answer_question(
)
evidence_response = api_call_with_retry(
self.llm.chat.completions.create,
temperature=self.temp,
model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
messages=[
{
@@ -139,6 +142,7 @@ def answer_question(

question_response = api_call_with_retry(
self.llm.chat.completions.create,
temperature=self.temp,
model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
messages=[
{"role": "system", "content": SYSTEM_QUESTION_PROMPT},
Expand All @@ -161,6 +165,7 @@ def answer_question(

hypotheses_response = api_call_with_retry(
self.llm.chat.completions.create,
temperature=0.5,
model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
messages=[
{"role": "system", "content": CORE_SCOUT_PERSONA},
@@ -199,7 +204,7 @@ def __init__(
storage_handler: BaseStorageHandler,
):
"""Initialise the evaluator"""
self.hypotheses = "None"
super().__init__()
self.vector_store = vector_store
self.llm = llm
self.storage_handler = storage_handler
@@ -237,23 +242,23 @@ def evaluate_questions(self, criteria: List[CriterionCreate], k: int = 3, save:
logger.info("Generating summary of answers...")
# Generate summary of answers
summary = self.generate_summary(question_answer_pairs)
self.project.results_summary = summary
self.storage_handler.update_item(self.project)
project_update = ProjectUpdate(id=self.project.id, name=self.project.name, results_summary=summary)
self.storage_handler.update_item(project_update)
return results

def generate_summary(self, question_answer_pairs: List[tuple]) -> str:
"""Generate a summary of the answers using an LLM, with an input prompt containing instructions."""

SUMMARIZE_RESPONSES_PROMPT = """You are a project delivery expert, you will be given question and answer pairs about a government project. Return a summary of the most important themes, you do not need to summarise all the questions, only return important, specific information. Be specific about project detail referred to. Return no more than 3 sentences. {qa_pairs}"""


formatted_input = ", ".join([f"Question: {qa[0]}\nAnswer: {qa[1]}" for qa in question_answer_pairs])
response = api_call_with_retry(
self.llm.chat.completions.create,
temperature=self.temp,
model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
messages=[
{
"role": "user",
"content": SUMMARIZE_RESPONSES_PROMPT.format(qa_pairs=formatted_input),
"content": SUMMARIZE_RESPONSES_PROMPT.format(qa_pairs=formatted_input,hypotheses=self.hypotheses),
},
],
)
@@ -270,13 +275,14 @@ def model(criterion: CriterionCreate, k: int = 3):
k=k,
)

extracted_words = re.findall(r"\b(positive|neutral|negative)\b", full_text, re.IGNORECASE)
# find any sentiment words that might be in square brackets
extracted_words = re.findall(r'\[?(positive|neutral|negative)\]?', full_text, re.IGNORECASE)

if extracted_words:
answer = extracted_words[-1].title()
# Remove the key words and brackets from full_text
full_text = re.sub(
r"\b(positive|neutral|negative)\b",
r'\[?(positive|neutral|negative)\]?',
"",
full_text,
flags=re.IGNORECASE,
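
# --- Illustrative sketch, not part of this diff: behaviour of the widened sentiment regex. ---
# The sample text below is hypothetical; brackets around the label are now stripped as well.
import re

full_text = "Milestones have slipped against the plan. Overall assessment: [Negative]"
extracted_words = re.findall(r'\[?(positive|neutral|negative)\]?', full_text, re.IGNORECASE)
print(extracted_words)               # ['Negative']
print(extracted_words[-1].title())   # Negative

cleaned = re.sub(r'\[?(positive|neutral|negative)\]?', "", full_text, flags=re.IGNORECASE)
print(cleaned)  # "Milestones have slipped against the plan. Overall assessment: "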
38 changes: 30 additions & 8 deletions scout/LLMFlag/prompts.py
@@ -161,17 +161,38 @@


#
# For section summaries
# For summaries
#
_summarise_core_prompt = """
You are an an expert project delivery adviser from the UK Government's Infrastructure and Projects Authority. \
You are given a list of questions that flag problems related to a project. \
Return a summary of the overall themes the questions are trying uncover. \
The summary must be at most 3 sentences long. \
Questions:
{questions}

_summarise_core_prompt = """
<start of task>
You will be given question and answer pairs about a government project. Return a summary of the most important themes and some lines of enquiry.
<end of task>
<start of rules>
You do not need to summarise all the questions, only return important, specific information.
Be specific about project detail referred to.
Return no more than 5 sentences.
Recommend at most 3 lines of enquiry at the end, and explicitly label them as suggested lines of enquiry.
Lines of enquiry are points to investigate further in interviews with the project team.
To form your lines of enquiry, use the hypotheses formed during the investigation.
Use simple HTML formatting tags: <strong> for bold and <br /> for line breaks between items and sections of your response.
There must be a line break before the suggested lines of enquiry section.
Do not use Markdown formatting.
Do not use any extra formatting for font type size or color or anything else.
Do not include a title in your answer.
<end of rules>
<start of hypotheses>
{hypotheses}
<end of hypotheses>
<start of question and answer pairs>
{qa_pairs}
<end of question and answer pairs>
"""

SUMMARIZE_RESPONSES_PROMPT = CORE_SCOUT_PERSONA + _summarise_core_prompt
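
# --- Illustrative sketch, not part of this diff: filling in the combined prompt. ---
# Assumes CORE_SCOUT_PERSONA contains no other format placeholders; the QA pair below is hypothetical.
qa_pairs = "Question: Is the delivery schedule realistic?\nAnswer: Negative - key milestones have already slipped."
message = SUMMARIZE_RESPONSES_PROMPT.format(qa_pairs=qa_pairs, hypotheses="None")
# 'message' is what MainEvaluator.generate_summary sends as the user message content.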



SUMMARISE_OUTPUTS_PROMPT = PromptTemplate.from_template(_summarise_core_prompt)


@@ -188,3 +209,4 @@
Extract:{text}
=======
"""

2 changes: 1 addition & 1 deletion scout/Pipelines/ingest_project_data.py
@@ -111,7 +111,7 @@ def ingest_project_files(
# Upload files to s3
s3_file_keys = s3_storage_handler.upload_folder_contents(
str(project_folder_path),
recursive=False,
recursive=True,
prefix=sanitise_project_name(project.name) + "/raw/",
)
logger.info(f"Uploaded {s3_file_keys} files to s3")