The following concepts will be discussed:
- Prerequisites
- Role permissions
- Lambda code
- Utilizing Secrets Manager (best practice)
- Batch processing
- Testing the Lambda function
- Creating a test event
- Triggering Lambda using an API call
Loading files from S3 into RDS SQL Server can be tough; if you have very large files, Lambda will never be the right tool.
This solution works when the file sizes are small.
Prerequisites:
- AWS account
- S3 bucket
- RDS (SQL server)
- some parquet files to load
First of all, let's set up the AWS permissions.
The role you are using needs the following policies.
Secrets Manager (to manage the RDS password):
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor",
"Effect": "Allow",
"Action": [
"secretsmanager:DescribeSecret",
"secretsmanager:PutSecretValue",
"secretsmanager:CreateSecret",
"secretsmanager:DeleteSecret",
"secretsmanager:CancelRotateSecret",
"secretsmanager:ListSecretVersionIds",
"secretsmanager:UpdateSecret",
"secretsmanager:GetRandomPassword",
"secretsmanager:GetResourcePolicy",
"secretsmanager:GetSecretValue",
"secretsmanager:StopReplicationToReplica",
"secretsmanager:ReplicateSecretToRegions",
"secretsmanager:RestoreSecret",
"secretsmanager:RotateSecret",
"secretsmanager:UpdateSecretVersionStage",
"secretsmanager:RemoveRegionsFromReplication",
"secretsmanager:ListSecrets"
],
"Resource": "*"
}
]
}
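The Lambda code further down reads the secret as a JSON string with username and password keys, so create the secret in that shape. A minimal sketch of creating such a secret with boto3; the region, secret name and credentials are placeholders:
import json
import boto3

client = boto3.client('secretsmanager', region_name='us-east-1')  # use your own region
client.create_secret(
    Name='rds-sqlserver-credentials',  # placeholder; must match the secret_name environment variable used later
    SecretString=json.dumps({'username': 'admin', 'password': 'REPLACE_ME'})
)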
S3 bucket:
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor",
"Effect": "Allow",
"Action": [
"s3:ListAccessPointsForObjectLambda",
"s3:DeleteAccessPoint",
"s3:DeleteAccessPointForObjectLambda",
"s3:PutLifecycleConfiguration",
"s3:DeleteObject",
"s3:CreateMultiRegionAccessPoint",
"s3:GetBucketWebsite",
"s3:GetMultiRegionAccessPoint",
"s3:PutReplicationConfiguration",
"s3:GetObjectAttributes",
"s3:InitiateReplication",
"s3:GetObjectLegalHold",
"s3:GetBucketNotification",
"s3:GetReplicationConfiguration",
"s3:DescribeMultiRegionAccessPointOperation",
"s3:PutObject",
"s3:PutBucketNotification",
"s3:CreateJob",
"s3:PutBucketObjectLockConfiguration",
"s3:GetStorageLensDashboard",
"s3:GetLifecycleConfiguration",
"s3:GetBucketTagging",
"s3:GetInventoryConfiguration",
"s3:GetAccessPointPolicyForObjectLambda",
"s3:ListBucket",
"s3:AbortMultipartUpload",
"s3:UpdateJobPriority",
"s3:PutBucketVersioning",
"s3:GetMultiRegionAccessPointPolicyStatus",
"s3:ListBucketMultipartUploads",
"s3:PutIntelligentTieringConfiguration",
"s3:PutMetricsConfiguration",
"s3:GetBucketVersioning",
"s3:GetAccessPointConfigurationForObjectLambda",
"s3:PutInventoryConfiguration",
"s3:GetMultiRegionAccessPointRoutes",
"s3:GetStorageLensConfiguration",
"s3:DeleteStorageLensConfiguration",
"s3:GetAccountPublicAccessBlock",
"s3:PutBucketWebsite",
"s3:ListAllMyBuckets",
"s3:PutBucketRequestPayment",
"s3:PutObjectRetention",
"s3:CreateAccessPointForObjectLambda",
"s3:GetBucketCORS",
"s3:GetObjectVersion",
"s3:PutAnalyticsConfiguration",
"s3:PutAccessPointConfigurationForObjectLambda",
"s3:GetObjectVersionTagging",
"s3:PutStorageLensConfiguration",
"s3:GetStorageLensConfigurationTagging",
"s3:ReplicateObject",
"s3:GetObjectAcl",
"s3:GetBucketObjectLockConfiguration",
"s3:DeleteBucketWebsite",
"s3:GetIntelligentTieringConfiguration",
"s3:GetObjectVersionAcl",
"s3:GetBucketPolicyStatus",
"s3:GetObjectRetention",
"s3:GetJobTagging",
"s3:ListJobs",
"s3:PutObjectLegalHold",
"s3:PutBucketCORS",
"s3:ListMultipartUploadParts",
"s3:GetObject",
"s3:DescribeJob",
"s3:PutBucketLogging",
"s3:GetAnalyticsConfiguration",
"s3:GetObjectVersionForReplication",
"s3:GetAccessPointForObjectLambda",
"s3:CreateAccessPoint",
"s3:GetAccessPoint",
"s3:PutAccelerateConfiguration",
"s3:SubmitMultiRegionAccessPointRoutes",
"s3:DeleteObjectVersion",
"s3:GetBucketLogging",
"s3:ListBucketVersions",
"s3:RestoreObject",
"s3:GetAccelerateConfiguration",
"s3:GetObjectVersionAttributes",
"s3:GetBucketPolicy",
"s3:PutEncryptionConfiguration",
"s3:GetEncryptionConfiguration",
"s3:GetObjectVersionTorrent",
"s3:GetBucketRequestPayment",
"s3:GetAccessPointPolicyStatus",
"s3:GetObjectTagging",
"s3:GetBucketOwnershipControls",
"s3:GetMetricsConfiguration",
"s3:GetBucketPublicAccessBlock",
"s3:GetMultiRegionAccessPointPolicy",
"s3:GetAccessPointPolicyStatusForObjectLambda",
"s3:ListAccessPoints",
"s3:PutBucketOwnershipControls",
"s3:DeleteMultiRegionAccessPoint",
"s3:ListMultiRegionAccessPoints",
"s3:UpdateJobStatus",
"s3:GetBucketAcl",
"s3:ListStorageLensConfigurations",
"s3:GetObjectTorrent",
"s3:GetBucketLocation",
"s3:GetAccessPointPolicy",
"s3:ReplicateDelete"
],
"Resource": "*"
}
]
}
CloudWatch Logs:
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor",
"Effect": "Allow",
"Action": [
"logs:DescribeQueries",
"logs:CreateLogStream",
"logs:GetLogRecord",
"logs:GetQueryResults",
"logs:DescribeLogStreams",
"logs:StartQuery",
"logs:GetLogEvents",
"logs:StopQuery",
"logs:GetLogGroupFields",
"logs:DescribeQueryDefinitions",
"logs:CreateLogGroup",
"logs:PutLogEvents"
],
"Resource": "*"
}
]
}
Lambda (replace the ARN placeholders with your own account and log group ARNs):
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "logs:CreateLogGroup",
"Resource": "ARN here:*"
},
{
"Effect": "Allow",
"Action": [
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": [
"ARN:log-group:log group here:*"
]
}
]
}
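If you prefer to attach these policies programmatically instead of through the console, here is a minimal sketch using an inline policy with boto3; the role name, policy name and the trimmed document are placeholders, and in practice you would pass the full JSON documents above:
import json
import boto3

# a trimmed example document; in practice, use the full JSON policies above
policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {"Effect": "Allow", "Action": "secretsmanager:GetSecretValue", "Resource": "*"}
    ]
}

iam = boto3.client('iam')
# role name and policy name are placeholders for your own values
iam.put_role_policy(
    RoleName='lambda-s3-to-rds-role',
    PolicyName='secrets-manager-access',
    PolicyDocument=json.dumps(policy_document)
)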
Now that the permissions are set up, we need to load the files into S3.
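A minimal sketch of uploading a parquet file with boto3; the local file, bucket name and key below are placeholders:
import boto3

s3 = boto3.client('s3')
# point these at your own file, bucket and prefix
s3.upload_file('sample-1.parquet', 'your-bucket-name', 'data/sample-1.parquet')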
Once the files are uploaded, let's create the Lambda function:
import os
import json
import urllib.parse
from io import BytesIO

import boto3
import pandas as pd
import pyarrow.parquet as pq
import pyodbc
import awswrangler as wr
from sqlalchemy import create_engine
from botocore.exceptions import ClientError
def get_secret():
    # we will create the connectionString below and use it in the process
    global connectionString
    server = os.environ.get('Server')            # get the details from environment variables
    db = os.environ.get('db')                    # get the details from environment variables
    secret_name = os.environ.get('secret_name')  # get the details from environment variables
    region_name = os.environ.get('region_name')  # get the details from environment variables

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',  # don't change this
        region_name=region_name
    )
    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        raise e

    # Secrets Manager decrypts the secret using the associated KMS key
    response = json.loads(get_secret_value_response['SecretString'])
    username = response.get('username')  # change these keys to match the values you provided during secret creation
    password = response.get('password')
    connectionString = (
        f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={db};'
        f'UID={username};PWD={password};Encrypt=yes;TrustServerCertificate=yes;Connection Timeout=300;'
    )
def s3_to_df(bucket, keys):
    s3Client = boto3.client('s3')
    file_objects = s3Client.list_objects_v2(Bucket=bucket, Prefix=keys)['Contents']
    for file_object in file_objects:
        file_key = file_object['Key']
        print(file_key)
        file_obj = s3Client.get_object(Bucket=bucket, Key=file_key)
        parquet_file = pq.ParquetFile(BytesIO(file_obj['Body'].read()))
        df = parquet_file.read().to_pandas()
    return df
def lambda_handler(event, context):
    # get our bucket and file name from the S3 event
    db = os.environ.get('db')
    bucket = event['Records'][0]['s3']['bucket']['name']
    keys = event['Records'][0]['s3']['object']['key']
    get_secret()
    print(f"this is the file name {keys}")
    # df = s3_to_df(bucket=bucket, keys=keys)
    conn = pyodbc.connect(connectionString)
    conn.execute(f"USE {db};")
    # you can change the engine connection string to connect to other RDS engines as well.
    # the raw ODBC connection string has to be URL-encoded before it is handed to SQLAlchemy
    engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(urllib.parse.quote_plus(connectionString)))
    s3Client = boto3.client('s3')
    file_objects = s3Client.list_objects_v2(Bucket=bucket, Prefix=keys)['Contents']
    # derive the target table name from the object key (assumption: the file name matches the table name)
    table_name = os.path.splitext(os.path.basename(keys))[0]
    for file_object in file_objects:
        file_key = file_object['Key']
        # there are two ways we can read the file (both are shown here; keep only one in your function):
        # 1. use awswrangler, which can read the file in a single go or in chunks
        # 2. read bigger files in batches with pyarrow and upload them batch by batch

        # method-1
        # dfs = wr.s3.read_parquet(path=[f"s3://{bucket}/{file_key}"])  # reads the whole file at once
        # we can also pass chunked=True so it picks the chunk size automatically, or provide a number
        dfs = wr.s3.read_parquet(path=[f"s3://{bucket}/{file_key}"], chunked=20_000)
        for df in dfs:
            df.to_sql(table_name, engine, if_exists='append', index=False)

        # method-2
        file_obj = s3Client.get_object(Bucket=bucket, Key=file_key)
        parquet_file = pq.ParquetFile(BytesIO(file_obj['Body'].read()))
        # you can change the number 100000 to a batch size of your choice (the pyarrow default is 65536)
        for idx, batch in enumerate(parquet_file.iter_batches(100000)):
            print(f"RecordBatch-{idx}")
            batch_df = batch.to_pandas()
            rows, columns = batch_df.shape
            # you can further loop using the rows variable above, so you can load even fewer rows at a time
            batch_df.to_sql(table_name, engine, if_exists='append', index=False)
For this to execute, we have to create Lambda layers.
These are the layers you need (use the Python 3.9 runtime):
- AWSSDKPandas-Python39 (this is an AWS-managed layer; it contains numpy and pandas)
- sqlalchemy
- pyodbc
To create a layer, follow the steps below (the example commands build the sqlalchemy layer):
- Create a folder on your local machine. If you are using Ubuntu (Linux), you can run these commands directly; if you are using Windows, use WSL for Linux capabilities.
- Removing the unnecessary folders reduces the size of the layer; AWS has a limit on layer size.
- If the layer is larger than 10 MB, upload the zip to S3 and create the layer from there (see the sketch after the commands).
pip3 install sqlalchemy==2.0.21 --target python/
find . -name "tests" -type d | xargs -I{} rm -rf {}
find . -name "__pycache__" -type d | xargs -I{} rm -rf {}
find . -name "docs" -type d | xargs -I{} rm -rf {}
rm -rf boto*~
zip -r sqlalchemy_layer.zip python
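Once the zip is in S3, you can publish the layer from there. A minimal sketch with boto3; the layer name, bucket and key are placeholders:
import boto3

lambda_client = boto3.client('lambda')
lambda_client.publish_layer_version(
    LayerName='sqlalchemy',                  # placeholder layer name
    Content={'S3Bucket': 'my-layer-bucket',  # bucket/key where sqlalchemy_layer.zip was uploaded
             'S3Key': 'sqlalchemy_layer.zip'},
    CompatibleRuntimes=['python3.9'],
)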
Once the layers are ready, attach them to the Lambda function and you can run it.
To test the Lambda function, you can either use a trigger or create a test event.
Below is the process for creating a test event:
- Open AWS Lambda
- Click on the function you created
- In the Code tab, click the down arrow beside the “Test” button
- Click on “Configure test event”
- Click on “Create new event”
- From the templates, select the trigger you need (for an S3 trigger, the “S3 Put” template; a trimmed example follows this list).
- Change the values in the JSON.
- Save the event and trigger the Lambda.
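For the S3-triggered handler above, a trimmed version of the “S3 Put” template is enough as a test event; the bucket name and object key below are placeholders to replace with your own:
{
  "Records": [
    {
      "eventSource": "aws:s3",
      "awsRegion": "us-east-1",
      "eventName": "ObjectCreated:Put",
      "s3": {
        "bucket": {
          "name": "your-bucket-name",
          "arn": "arn:aws:s3:::your-bucket-name"
        },
        "object": {
          "key": "data/sample-1.parquet"
        }
      }
    }
  ]
}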
You can also call the Lambda function through an API. All you need to do is add a trigger to the Lambda function and select API Gateway; then select HTTP and click OK.
You will immediately get a link for the API Gateway endpoint. You can call that API from your local machine (as long as you are on the same network) and trigger the function. You can also call the API in parallel to get the responses. The code below creates parallel threads:
import requests
import asyncio
from timeit import default_timer
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio
nest_asyncio.apply()  # this suppresses the "event loop already started" error
import json

START_TIME = default_timer()

def request(session, data, idx):
    url = "https://xxxxxxx.amazonaws.com/default/lambda-function-name"
    print(data)
    with session.post(url, data=data, headers={'Content-Type': 'text/plain'}) as response:
        print(response)
        output = response.text
        if response.status_code != 200:
            print("FAILURE::{0}".format(url))
        elapsed_time = default_timer() - START_TIME
        completed_at = "{:5.2f}s".format(elapsed_time)
        print("{0:<30} {1:>20}".format(idx, completed_at))
        return output

async def start_async_process():
    print("{0:<30} {1:>20}".format("No", "Completed at"))
    with ThreadPoolExecutor(max_workers=10) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            START_TIME = default_timer()
            tasks = [
                loop.run_in_executor(
                    executor,
                    request,
                    *(session, json.dumps({
                        "id": idx,
                        "file_name": i.get('file_name'),
                        "table_name": i.get('table_name'),
                        "database_name": i.get('database_name')
                    }), idx)
                )
                for idx, i in enumerate([
                    {'file_name': 'sample-1.parquet',
                     'table_name': 'sample',
                     'database_name': 'test'},
                    {'file_name': 'sample-2.parquet',
                     'table_name': 'sample',
                     'database_name': 'test'}
                ])
            ]
            for response in await asyncio.gather(*tasks):
                print(response)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(start_async_process())
    loop.run_until_complete(future)