Asif Rahman

AWS Lambda Web Scraper

Posted on 2020-06-21

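This is a small AWS Lambda function that scrapes websites using axios and stores each response as a document in MongoDB. You can set up an API Gateway in front of the Lambda function and invoke it with an HTTP request whose JSON body contains the `url` to scrape; the function itself then fetches that URL with a GET request.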

Features

  • Randomly selects from a set of headers with each call.
  • Automatically sets the host and referer to the same domain.
  • Saves the response to MongoDB.
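  • Optionally sets the Accept header to JSON (when the request includes a `json` key) if you expect the response to be in JSON format.
  • Optionally adds the `X-Requested-With: XMLHttpRequest` header (when the request includes an `ajax` key) so the request looks like an AJAX call.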

Install the required Node modules: npm install axios mongodb dotenv

const request = require('axios');
const MongoClient = require('mongodb').MongoClient;
const crypto = require('crypto');

// local .env files are loaded into process.env
require('dotenv').config({silent: false});

// load the MongoDB connection string from the environment (populated from
// the local .env file by dotenv); undefined when MONGO is not set
const mongo_host = process.env.MONGO;

// database and collection that scraped documents are written to;
// declared with const so they are module-scoped instead of implicit globals
// (an undeclared assignment throws a ReferenceError in strict mode)
const databaseName = 'scraper';
const collectionName = 'rawdata';

// Set of browser-like request header templates; one is picked at random for
// each scrape. Every template carries a google.com Referer by default, which
// the handler replaces with the target site's own origin before sending.
// The binding is never reassigned, so it is declared with const.
// NOTE(review): most templates advertise "br" in Accept-Encoding; confirm the
// installed axios version can decode Brotli-compressed responses.
const headers_list = [
    // Firefox 77 Mac
    {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    },
    // Firefox 77 Windows
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    },
    // Chrome 83 Mac
    {
        "Connection": "keep-alive",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
    },
    // Chrome 83 Windows
    {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9"
    }
];

/**
 * Check whether a value is an absolute http(s) URL.
 *
 * Validation is delegated to the WHATWG `URL` parser, so anything accepted
 * here can also be passed to `new URL()` without throwing. Strings without a
 * scheme (e.g. "www.example.com"), free text that merely contains a
 * domain-like token, and non-http(s) schemes are rejected.
 *
 * @param {*} string - Candidate URL; non-string values are rejected.
 * @returns {boolean} true when `string` parses as an http: or https: URL.
 */
function isValidURL(string) {
    if (typeof string !== 'string') {
        return false;
    }

    let parsed;
    try {
        parsed = new URL(string);
    } catch (err) {
        // new URL() throws a TypeError for anything it cannot parse
        return false;
    }

    return parsed.protocol === 'http:' || parsed.protocol === 'https:';
}

/**
 * Lambda handler: fetch a URL with browser-like headers and store the
 * response in MongoDB.
 *
 * Parameters are read from the JSON in `event.body`; when the event has no
 * body they are read from `event.queryStringParameters` instead.
 *   - url  (required) absolute URL to fetch
 *   - json (optional, presence only) ask for a JSON response via Accept
 *   - ajax (optional, presence only) add X-Requested-With: XMLHttpRequest
 *
 * @param {object} event - Invocation event carrying `body` and/or
 *     `queryStringParameters`.
 * @returns {Promise<object>} `{ok: 1, url, insertedId}` when a document was
 *     stored; otherwise `{ok: 0, ...}` with `msg` (and `status` for a non-200
 *     response) describing the failure where one is known.
 * @throws Errors from axios that are not HTTP status errors (e.g. network
 *     failures) and errors from the MongoDB driver propagate to the caller.
 */
module.exports.scrape = async event => {
    // the body normally arrives as a JSON string; tolerate an already-parsed
    // object and fall back to the query string when there is no body at all
    let body = event.body;
    if (typeof body === 'string') {
        try {
            body = JSON.parse(body);
        } catch (err) {
            return {ok: 0, msg: 'Invalid JSON body'};
        }
    }
    if (body === null || body === undefined) {
        body = event.queryStringParameters;
    }

    // url is required (the `in` operator throws on non-objects, so guard first)
    if (body === null || typeof body !== 'object' || !('url' in body)) {
        return {ok: 0, msg: 'Missing URL'};
    }

    // check the url is valid
    const url = body.url;
    if (typeof url !== 'string' || !isValidURL(url)) {
        return {ok: 0, msg: 'Invalid URL'};
    }

    let host;
    try {
        host = new URL(url);
    } catch (err) {
        // the validator and the URL parser may disagree; treat as invalid
        return {ok: 0, msg: 'Invalid URL'};
    }

    // randomly select a header template and copy it, so per-request overrides
    // never leak into later invocations that reuse this module instance
    const template = headers_list[Math.floor(Math.random() * headers_list.length)];
    const headers = {
        ...template,
        // the request should look like it is originating from the host
        'Host': host.host,
        // referer is from the same domain, referers from google.com are often
        // redirected, which we want to avoid
        'Referer': host.origin,
    };

    // set json headers if we expect the response to be in json
    if ('json' in body) {
        headers['Accept'] = 'application/json, text/javascript, */*; q=0.01';
    }

    // set XMLHttpRequest header, which helps when calling private APIs that
    // would typically be loaded by AJAX calls
    if ('ajax' in body) {
        headers['X-Requested-With'] = 'XMLHttpRequest';
    }

    // send a GET request with our headers; accept every status code so that
    // non-2xx responses reach the status check below instead of rejecting
    const response = await request({
        'url': url,
        'method': 'get',
        'headers': headers,
        'validateStatus': () => true,
    });

    if (response.status !== 200) {
        return {
            ok: 0,
            url: url,
            status: response.status,
            msg: 'Bad response status'
        };
    }

    // create a data object containing the response body and headers
    const date = new Date();
    const data = {
        'url': url,
        'url_hash': crypto.createHash('md5').update(url).digest("hex"),
        'host': host.host,
        'data': response.data,
        'processed': false,
        'scraped_at': date,
        'scraped_year': date.getFullYear(),
        'scraped_month': date.getMonth() + 1,
        'scraped_day': date.getDate(),
        'response_headers': response.headers,
        'request_headers': headers
    };

    // create a connection to the MongoDB
    const client = await MongoClient.connect(mongo_host, {useUnifiedTopology: true});
    let r;
    try {
        // insert data into collection in database
        r = await client.db(databaseName).collection(collectionName).insertOne(data);
    } finally {
        // always close the connection, even when the insert fails
        await client.close();
    }

    // older drivers report insertedCount; newer ones only report acknowledged
    const inserted = r.insertedCount === 1 ||
        (r.insertedCount === undefined && r.acknowledged === true);

    if (inserted) {
        // return the newly created ObjectID if a new document was successfully inserted
        return {
            ok: 1,
            url: url,
            insertedId: r.insertedId,
        };
    }

    return {
        ok: 0,
        url: url
    };
};