Node.js Script to Retrieve First Page of PDF Files from S3 Bucket

Node.js Script to Retrieve First Page of PDF Files from S3 Bucket

·

2 min read

If you want to process multiple PDF files and extract only the first page from each of them, you can modify the code to iterate over the files.

First, make sure the required packages are installed. The script uses both aws-sdk and pdfjs-dist; if you do not have them yet, install them:

npm i aws-sdk pdfjs-dist
const AWS = require('aws-sdk');
const PDFJS = require('pdfjs-dist');

// Configure the SDK. Each value is taken from the environment when it is set
// there, so real secrets do not have to be written into this file; the
// placeholder strings are only used as a fallback when a variable is missing.
AWS.config.update({
  accessKeyId: process.env.AWS_ACCESS_KEY_ID || 'YOUR_ACCESS_KEY',
  secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY || 'YOUR_SECRET_ACCESS_KEY',
  // Only set for temporary credentials; stays undefined otherwise.
  sessionToken: process.env.AWS_SESSION_TOKEN,
  region: process.env.AWS_REGION || 'YOUR_AWS_REGION'
});

const s3 = new AWS.S3();
// S3_BUCKET_NAME is specific to this script (it is not an SDK variable).
const bucketName = process.env.S3_BUCKET_NAME || 'YOUR_BUCKET_NAME';

/**
 * Fetch one object from the configured bucket and hand back its body.
 *
 * @param {string} fileName - Key of the object inside `bucketName`.
 * @returns {Promise<*>} The `Body` property of the `getObject` response.
 * @throws Re-throws whatever the request rejects with, after logging it.
 */
async function downloadPdf(fileName) {
  const objectLocation = {
    Bucket: bucketName,
    Key: fileName
  };

  try {
    const request = s3.getObject(objectLocation);
    const s3Object = await request.promise();
    return s3Object.Body;
  } catch (downloadError) {
    // Log with the file name for context, then let the caller decide what to do.
    console.error(`Error downloading PDF ${fileName}:`, downloadError);
    throw downloadError;
  }
}

/**
 * Open a PDF with pdf.js and return the data of its first page.
 *
 * @param {*} pdfData - PDF source handed to `PDFJS.getDocument`. A Node
 *   `Buffer` (what `downloadPdf` returns from S3) is copied into a plain
 *   `Uint8Array` first; any other value is passed through untouched.
 * @returns {Promise<*>} The result of `page.getData()` when the page object
 *   offers that method, otherwise the result of `page.getTextContent()`.
 * @throws Re-throws any loading or extraction error after logging it.
 */
async function extractFirstPage(pdfData) {
  // Recent pdf.js releases refuse Node Buffers and ask for a Uint8Array, so
  // hand over a plain copy instead of the Buffer itself.
  const source =
    typeof Buffer !== 'undefined' && Buffer.isBuffer(pdfData)
      ? new Uint8Array(pdfData)
      : pdfData;

  let loadingTask;
  try {
    loadingTask = PDFJS.getDocument(source);
    // getDocument returns a loading task; the document is delivered through
    // its `promise`. Very old releases returned a thenable task, so fall back
    // to awaiting the task itself when `promise` is absent.
    const pdf = await (loadingTask.promise || loadingTask);
    const page = await pdf.getPage(1); // Page numbers start at 1.

    // NOTE(review): the documented pdf.js page API has no `getData`; it is
    // kept only for builds that provide it. Text content is the fallback.
    if (typeof page.getData === 'function') {
      return await page.getData();
    }
    return await page.getTextContent();
  } catch (error) {
    console.error('Error extracting first page:', error);
    throw error;
  } finally {
    // Release the document and its worker so that processing many files in a
    // row does not accumulate open documents.
    if (loadingTask && typeof loadingTask.destroy === 'function') {
      await loadingTask.destroy();
    }
  }
}

/**
 * Download each named PDF and extract its first page, one file at a time.
 *
 * A failure on one file is logged and does not stop the remaining files.
 *
 * @param {Iterable<string>} fileNames - Object keys to process, in order.
 * @returns {Promise<void>}
 */
async function processFiles(fileNames) {
  // Full pipeline for a single file; any rejection is handled by the loop.
  const handleFile = async (name) => {
    console.log(`Processing file: ${name}`);
    const pdfBytes = await downloadPdf(name);
    const firstPage = await extractFirstPage(pdfBytes);
    console.log(`First page of ${name} extracted successfully:`, firstPage);
  };

  for (const name of fileNames) {
    try {
      await handleFile(name);
    } catch (failure) {
      console.error(`Error processing file ${name}:`, failure);
    }
  }
}

// Example usage: process a list of file names
const fileList = ['file1.pdf', 'file2.pdf', 'file3.pdf'];
// Per-file failures are expected to be handled inside processFiles; this
// handler catches anything that still escapes it, so the promise is never left
// unhandled and the script ends with a failing exit code.
processFiles(fileList).catch((error) => {
  console.error('Unexpected error while processing files:', error);
  process.exitCode = 1;
});

In the above code, the processFiles function accepts an array of file names and iterates over them, downloading each PDF file and extracting the first page. The extracted page data is then logged to the console. You can modify the fileList array to contain the names of the PDF files you want to process.

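Remember to replace the placeholders (YOUR_ACCESS_KEY, YOUR_SECRET_ACCESS_KEY, YOUR_AWS_REGION, and YOUR_BUCKET_NAME) with your actual AWS credentials and S3 information. For anything beyond a local experiment, do not commit real keys to source control; the AWS SDK can also load credentials from environment variables or the shared credentials file.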

That's it, folks!

Did you find this article valuable?

Support ramu k by becoming a sponsor. Any amount is appreciated!