Carlos Carrero
1 min readFeb 2, 2024

--

Hi Sushil. Thank you for trying it. There will be a quisktart on this pretty soon. Meanwhile You can try this:

create or replace function pdf_text_chunker(file_url string)

returns table (chunk varchar)

language python

runtime_version = '3.9'

handler = 'pdf_text_chunker'

packages = ('snowflake-snowpark-python','PyPDF2', 'langchain')

as

$$

from snowflake.snowpark.types import StringType, StructField, StructType

from langchain.text_splitter import RecursiveCharacterTextSplitter

from snowflake.snowpark.files import SnowflakeFile

import PyPDF2, io

import logging

import pandas as pd

class pdf_text_chunker:

def read_pdf(self, file_url: str) -> str:

logger = logging.getLogger("udf_logger")

logger.info(f"Opening file {file_url}")

with SnowflakeFile.open(file_url, 'rb') as f:

buffer = io.BytesIO(f.readall())

reader = PyPDF2.PdfReader(buffer)

text = ""

for page in reader.pages:

try:

text += page.extract_text().replace('\n', ' ').replace('\0', ' ')

except:

text = "Unable to Extract"

logger.warn(f"Unable to extract from file {file_url}, page {page}")

return text

def process(self,file_url: str):

text = self.read_pdf(file_url)

text_splitter = RecursiveCharacterTextSplitter(

chunk_size = 4000, #Adjust this as you see fit

chunk_overlap = 300, #This let's text have some form of overlap. Useful for keeping chunks contextual

length_function = len

)

chunks = text_splitter.split_text(text)

df = pd.DataFrame(chunks, columns=['chunks'])

yield from df.itertuples(index=False, name=None)

$$

--

--

Carlos Carrero
Carlos Carrero

Written by Carlos Carrero

Global Principal Architect - AI/ML, Partners @Snowflake

Responses (1)