Asif Rahman

AWS Lambda Web Scraper

2020-06-21

This is a small AWS Lambda function that scrapes websites with axios and stores the response in a MongoDB document. You can set up an API Gateway endpoint in front of the Lambda function and call it with a POST request that carries a JSON body.
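
For example, a POST body accepted by the handler below might look like the following; the URL is only a placeholder, and json and ajax are the optional flags described under Features:

{
  "url": "https://example.com/some/page",
  "json": true,
  "ajax": true
}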

Features

  • Randomly selects from a set of browser headers on each call.
  • Automatically sets the Host and Referer headers to the target domain.
  • Saves the response to MongoDB.
  • Optionally sets the Accept header to JSON if you expect the response to be in JSON format.
  • Optionally marks the request as an XMLHttpRequest, which helps when calling APIs that are normally fetched via AJAX.

Install the required Node modules: npm install axios mongodb dotenv
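
The connection string is read from an environment variable named MONGO. For local development, a .env file in the project root might look like the following; the URI is only a placeholder, and on Lambda you would set MONGO as an environment variable on the function instead:

MONGO=mongodb+srv://user:password@cluster0.example.mongodb.net/?retryWrites=true&w=majority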

const request = require('axios');
const MongoClient = require('mongodb').MongoClient;
const crypto = require('crypto');

// local .env files are loaded into process.env
require('dotenv').config({silent: false});

// load the MongoDB connection string from the .env file
const mongo_host = process.env.MONGO;

// database and collection names
const databaseName = 'scraper';
const collectionName = 'rawdata';

// set of headers from which we will randomly select
let headers_list = [
  // Firefox 77 Mac
   {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
  },
  // Firefox 77 Windows
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.google.com/",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
  },
  // Chrome 83 Mac
  {
    "Connection": "keep-alive",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Dest": "document",
    "Referer": "https://www.google.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
  },
  // Chrome 83 Windows 
  {
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Dest": "document",
    "Referer": "https://www.google.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9"
  }
]

function isValidURL(string) {
  var res = string.match(/(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/g);
  return (res !== null)
};

module.exports.scrape = async event => {
  // parse the request body, assuming a POST request with a JSON body
  if (!event.body) {
    return {ok: 0, msg: 'Missing request body'}
  }
  let body = JSON.parse(event.body)

  // url is required
  if (!('url' in body)) {
    return {ok: 0, msg: 'Missing URL'}
  }

  // check the url is valid
  if (!isValidURL(body.url)) {
    return {ok: 0, msg: 'Invalid URL'}
  }

  let url = body.url
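  // parse the URL to extract the host (assumes the URL includes a scheme such as https://)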
  let host = new URL(url)

  // randomly select a header set and copy it, so that the per-request changes
  // below do not mutate the shared objects in headers_list between invocations
  let headers = { ...headers_list[Math.floor(Math.random() * headers_list.length)] }
  // the request should look like it is originating from the host
  headers['Host'] = host.host
  // set the Referer to the same domain; requests with a google.com Referer
  // are often redirected, which we want to avoid
  headers['Referer'] = host.origin

  // set json headers if we expect the response to be in json
  if ('json' in body) {
    headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
  }

  // set XMLHttpRequest header, which helps when calling private APIs that 
  // would typically be loaded by AJAX calls
  if ('ajax' in body) {
    headers['X-Requested-With'] = 'XMLHttpRequest'
  }

  // send a GET request with our headers; axios rejects non-2xx responses by
  // default, so accept every status here and check response.status ourselves
  const response = await request({
    'url': url,
    'method': 'get',
    'headers': headers,
    'validateStatus': () => true,
  });

  if (response.status == 200) {
    // create a data object containing the response body and headers
    let date = new Date()
    let data = {
      'url': url,
      'url_hash': crypto.createHash('md5').update(url).digest("hex"),
      'host': host.host,
      'data': response.data,
      'processed': false,
      'scraped_at': date,
      'scraped_year': date.getFullYear(),
      'scraped_month': date.getMonth() + 1,
      'scraped_day': date.getDate(),
      'response_headers': response.headers,
      'request_headers': headers
    }

    // create a connection to the MongoDB
    const client = await MongoClient.connect(mongo_host, {useUnifiedTopology: true});
    // select the database
    const db = client.db(databaseName);
    // insert data into collection in database
    let r = await db.collection(collectionName).insertOne(data);
    // close the connection to MongoDB before returning
    await client.close();

    if (r.insertedCount == 1) {
      // return the newly created ObjectID if a new document was successfully inserted
      return {
        ok: 1,
        url: url,
        insertedId: r.insertedId,
      };
    }
  } else {
    return {
      ok: 0,
      url: url,
      status: response.status,
      msg: 'Bad response status'
    };
  }

  // reached only if the document could not be inserted
  return {
    ok: 0,
    url: url,
    msg: 'Failed to insert document'
  };
};
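
To smoke-test the handler locally before wiring up API Gateway, you can call it directly with a fake event. This is only a sketch: it assumes the code above is saved as handler.js and that a local .env file provides the MONGO connection string.

// local-test.js -- a minimal local invocation (hypothetical file name)
const { scrape } = require('./handler');

scrape({ body: JSON.stringify({ url: 'https://example.com/', json: true }) })
  .then(result => console.log(result))
  .catch(err => console.error(err));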