"""Legal Assistant AI.

Streamlit app that ingests a legal document (PDF or DOCX) and then:
  1. summarizes it chunk-by-chunk with BART (facebook/bart-large-cnn),
  2. classifies each chunk with Legal-BERT via a text-classification pipeline,
  3. averages per-chunk softmax predictions from a Legal-BERT sequence
     classifier ("predictive analysis"), and
  4. sends the artifacts to OpenAI GPT-4 for a judge-style decision.
"""

import streamlit as st
import torch
import fitz  # PyMuPDF
from docx import Document
from openai import OpenAI
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

SUMMARIZER_MODEL_NAME = "facebook/bart-large-cnn"
CLASSIFIER_MODEL_NAME = "nlpaueb/legal-bert-base-uncased"
PREDICTIVE_MODEL_NAME = "nlpaueb/legal-bert-base-uncased"


@st.cache_resource
def load_models():
    """Load every model/tokenizer once per server process.

    Streamlit re-executes the whole script on every user interaction; without
    caching, all three (large) models were re-loaded on each rerun.

    Returns:
        Tuple of (summarizer_tokenizer, summarizer_model, classifier,
        predictive_model, predictive_tokenizer).
    """
    summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL_NAME)
    summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL_NAME)
    classifier = pipeline("text-classification", model=CLASSIFIER_MODEL_NAME)
    predictive_model = AutoModelForSequenceClassification.from_pretrained(
        PREDICTIVE_MODEL_NAME
    )
    predictive_tokenizer = AutoTokenizer.from_pretrained(PREDICTIVE_MODEL_NAME)
    return (
        summarizer_tokenizer,
        summarizer_model,
        classifier,
        predictive_model,
        predictive_tokenizer,
    )


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: file-like object (Streamlit UploadedFile) holding PDF bytes.

    Returns:
        str: all page text, in page order.
    """
    # Context manager closes the fitz document (the original leaked the
    # handle on every upload).
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_doc:
        return "".join(page.get_text() for page in pdf_doc)


def extract_text_from_docx(docx_file):
    """Return the text of a DOCX file, one line per paragraph."""
    doc = Document(docx_file)
    return "\n".join(para.text for para in doc.paragraphs)


def chunk_text(text, tokenizer, chunk_size=512):
    """Split *text* into lists of at most *chunk_size* token ids.

    Args:
        text: the raw document text.
        tokenizer: HF tokenizer used to encode the text.
        chunk_size: maximum number of token ids per chunk.

    Returns:
        list[list[int]]: consecutive token-id chunks (no special tokens).
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]


def _summarize_chunks(chunks, tokenizer, model):
    """Summarize each token chunk with BART and join the partial summaries."""
    summaries = []
    for chunk in chunks:
        # NOTE(fix): the original prepended the T5-style "summarize: " prefix;
        # bart-large-cnn is conditioned on raw article text, and the prefix
        # leaked into the generated summaries.
        inputs = tokenizer.encode(
            tokenizer.decode(chunk),
            return_tensors="pt",
            max_length=1024,
            truncation=True,
        )
        summary_ids = model.generate(
            inputs,
            max_length=150,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
    return " ".join(summaries)


def _classify_chunks(chunks, decode_tokenizer, classifier):
    """Run the Legal-BERT classification pipeline over each decoded chunk.

    NOTE(fix): the original rebound the name ``chunk_text`` to a string inside
    this loop, shadowing the module-level helper function of the same name.
    """
    return [classifier(decode_tokenizer.decode(chunk)) for chunk in chunks]


def _predict_chunks(chunks, decode_tokenizer, tokenizer, model):
    """Return per-class softmax probabilities averaged over all chunks."""
    all_predictions = []
    for chunk in chunks:
        inputs = tokenizer(
            decode_tokenizer.decode(chunk),
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512,
        )
        with torch.no_grad():  # inference only — no gradients needed
            outputs = model(**inputs)
        all_predictions.append(torch.softmax(outputs.logits, dim=-1))
    return torch.mean(torch.stack(all_predictions), dim=0)


def _get_decision(openai_key, full_summary, classification_result, avg_predictions):
    """Ask GPT-4 for a judge-style decision based on the analysis artifacts."""
    client = OpenAI(api_key=openai_key)
    decision_prompt = f"""
Analyze the following legal document analysis data and provide a decision and reasoning:

Summary of the Document:
{full_summary}

Classification Result:
{classification_result}

Average Predictions:
{avg_predictions.tolist()}

Based on the above analysis, provide the judge's decision and reasoning in a detailed manner.
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a legal expert analyzing the case."},
            {"role": "user", "content": decision_prompt},
        ],
        max_tokens=500,
    )
    return response.choices[0].message.content.strip()


# ---------------------------------------------------------------------------
# Streamlit UI (executed top-to-bottom on every interaction)
# ---------------------------------------------------------------------------
st.title("Legal Assistant AI")
st.header("Upload Legal Document")
uploaded_file = st.file_uploader("Upload a Word document or PDF", type=["pdf", "docx"])
openai_key = st.text_input("Enter your OpenAI API key", type="password")

if uploaded_file is not None:
    (
        summarizer_tokenizer,
        summarizer_model,
        classifier,
        predictive_model,
        predictive_tokenizer,
    ) = load_models()

    if uploaded_file.type == "application/pdf":
        document_text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        document_text = extract_text_from_docx(uploaded_file)
    else:
        # Unreachable given the uploader's type filter, but prevents a
        # NameError below if the MIME type is ever something unexpected.
        document_text = ""

    st.header("Input Legal Document")
    st.text_area("Document Content", document_text, height=300)

    if st.button("Analyze Document"):
        if not openai_key:
            st.write("Please enter your OpenAI API key.")
        elif not document_text:
            st.write("Please input a legal document for analysis.")
        else:
            # Summarization
            st.subheader("Summary of the Document")
            chunks = chunk_text(document_text, summarizer_tokenizer)
            full_summary = _summarize_chunks(chunks, summarizer_tokenizer, summarizer_model)
            st.write(full_summary)

            # Classification
            st.subheader("Classification of the Document")
            classification_result = _classify_chunks(chunks, summarizer_tokenizer, classifier)
            st.write(classification_result)

            # Predictive Analysis
            st.subheader("Predictive Analysis")
            avg_predictions = _predict_chunks(
                chunks, summarizer_tokenizer, predictive_tokenizer, predictive_model
            )

            # Decision Making (using OpenAI GPT-4)
            st.subheader("Judge's Decision and Reasoning")
            decision_and_reasoning = _get_decision(
                openai_key, full_summary, classification_result, avg_predictions
            )
            st.write(decision_and_reasoning)