Spaces:

rohan13
/

canvas-discussion-grader-with-feedback

Runtime error

App Files Files Community

canvas-discussion-grader-with-feedback / custom_csv_loader.py

rohan13

Custom csv loader

0cad0b3 about 1 year ago

raw

history blame

No virus

4.24 kB

	import csv
	from typing import Any, Dict, List, Optional

	from langchain.docstore.document import Document
	from langchain.document_loaders.base import BaseLoader
	from langchain.document_loaders.unstructured import (
	UnstructuredFileLoader,
	validate_unstructured_version,
	)


	class CSVLoader(BaseLoader):
	    """Loads a CSV file into a list of documents.

	    Each document represents one row of the CSV file. Every column of the row
	    is rendered as a ``key: value`` line in the document's page_content.

	    The source for each document loaded from csv is set to the value of the
	    `file_path` argument for all documents by default.
	    You can override this by setting the `source_column` argument to the
	    name of a column in the CSV file.
	    The source of each document will then be set to the value of the column
	    with the name specified in `source_column`.

	    Ragged rows are tolerated: cells missing from a short row are rendered
	    as an empty value, and surplus cells of a long row (which
	    ``csv.DictReader`` collects under its ``restkey``) are left out of the
	    page_content.

	    Output Example:
	        .. code-block:: txt

	            column1: value1
	            column2: value2
	            column3: value3
	    """

	    def __init__(
	        self,
	        file_path: str,
	        source_column: Optional[str] = None,
	        csv_args: Optional[Dict] = None,
	        encoding: Optional[str] = None,
	    ):
	        """

	        Args:
	            file_path: The path to the CSV file.
	            source_column: The name of the column in the CSV file to use as the source.
	              Optional. Defaults to None.
	            csv_args: A dictionary of arguments to pass to the csv.DictReader.
	              Optional. Defaults to None.
	            encoding: The encoding of the CSV file. Optional. Defaults to None.
	        """
	        self.file_path = file_path
	        self.source_column = source_column
	        self.encoding = encoding
	        self.csv_args = csv_args or {}

	    @staticmethod
	    def _as_text(value: Any) -> str:
	        """Return a stripped text form of a header or cell value.

	        ``csv.DictReader`` yields ``None`` (its default ``restval``) for cells
	        missing from a short row; those become an empty string instead of
	        raising ``AttributeError`` on ``.strip()``.
	        """
	        if value is None:
	            return ""
	        if isinstance(value, str):
	            return value.strip()
	        # A caller-supplied ``restval`` or ``fieldnames`` entry may not be a str.
	        return str(value).strip()

	    def load(self) -> List[Document]:
	        """Load data into document objects.

	        Returns:
	            One Document per CSV row. ``metadata["source"]`` is the row's
	            `source_column` value when configured, otherwise `file_path`;
	            ``metadata["row"]`` is the zero-based row index.

	        Raises:
	            ValueError: If `source_column` is set but a row has no such column.
	        """
	        # Surplus cells of a long row are stored by DictReader as a list under
	        # its ``restkey`` (None unless overridden through csv_args). The literal
	        # "restkey" is kept in the skip set to preserve the original filter.
	        overflow_keys = (self.csv_args.get("restkey"), "restkey")

	        docs = []
	        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
	            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
	            for i, row in enumerate(csv_reader):
	                content = "\n".join(
	                    f"{self._as_text(k)}: {self._as_text(v)}"
	                    for k, v in row.items()
	                    if k not in overflow_keys
	                )
	                try:
	                    source = (
	                        row[self.source_column]
	                        if self.source_column is not None
	                        else self.file_path
	                    )
	                except KeyError as err:
	                    raise ValueError(
	                        f"Source column '{self.source_column}' not found in CSV file."
	                    ) from err
	                metadata = {"source": source, "row": i}
	                docs.append(Document(page_content=content, metadata=metadata))

	        return docs


	class UnstructuredCSVLoader(UnstructuredFileLoader):
	    """CSV loader that delegates parsing to the ``unstructured`` package.

	    As with the other Unstructured loaders, both "single" and "elements"
	    modes are available. In "elements" mode the CSV file is a single
	    Unstructured Table element, and an HTML representation of the table
	    is available under the "text_as_html" key of the document metadata.

	    Examples
	    --------
	    from langchain.document_loaders.csv_loader import UnstructuredCSVLoader

	    loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
	    docs = loader.load()
	    """

	    def __init__(
	        self,
	        file_path: str,
	        mode: str = "single",
	        **unstructured_kwargs: Any,
	    ):
	        """Check the installed unstructured version, then set up the loader.

	        Args:
	            file_path: The path to the CSV file.
	            mode: The mode to use when loading the CSV file.
	                Optional. Defaults to "single".
	            **unstructured_kwargs: Keyword arguments to pass to unstructured.
	        """
	        # Version check runs before the parent constructor is called.
	        validate_unstructured_version(min_unstructured_version="0.6.8")
	        super().__init__(
	            file_path=file_path,
	            mode=mode,
	            **unstructured_kwargs,
	        )

	    def _get_elements(self) -> List:
	        """Partition the CSV file with unstructured and return the elements."""
	        # Imported lazily so the module loads even when unstructured's CSV
	        # partitioner is only needed at call time.
	        from unstructured.partition.csv import partition_csv

	        elements = partition_csv(
	            filename=self.file_path,
	            **self.unstructured_kwargs,
	        )
	        return elements