Asif Rahman

AWS Lambda Web Scraper

This is a small AWS Lambda function that scrapes websites using axios and stores the scraped data in a MongoDB document. You can set up an API Gateway in front of the Lambda function and call it with GET requests.

Installation

Install the required Node modules: npm install axios mongodb dotenv

const request = require('axios');
const MongoClient = require('mongodb').MongoClient;
const crypto = require('crypto');

// local .env files are loaded into process.env
require('dotenv').config({silent: false});

// load the MongoDB connection string from the .env file
const mongo_host = process.env.MONGO;

// database and collection name
// (declared with const — these were implicit globals before, which is a
// strict-mode error and leaks onto the global object otherwise)
const databaseName = 'scraper';
const collectionName = 'rawdata';

// Set of realistic browser header profiles from which one is randomly
// selected per request, to make scraping traffic look like a normal browser.
// Declared const (the array binding is never reassigned). NOTE: the objects
// inside are shared module state — callers must copy an entry (e.g. with
// spread) before mutating per-request fields such as Host or Referer.
const headers_list = [
	// Firefox 77 Mac
	{
		"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
		"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
		"Accept-Language": "en-US,en;q=0.5",
		"Referer": "https://www.google.com/",
		"DNT": "1",
		"Connection": "keep-alive",
		"Upgrade-Insecure-Requests": "1"
	},
	// Firefox 77 Windows
	{
		"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
		"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
		"Accept-Language": "en-US,en;q=0.5",
		"Accept-Encoding": "gzip, deflate, br",
		"Referer": "https://www.google.com/",
		"DNT": "1",
		"Connection": "keep-alive",
		"Upgrade-Insecure-Requests": "1"
	},
	// Chrome 83 Mac
	{
		"Connection": "keep-alive",
		"DNT": "1",
		"Upgrade-Insecure-Requests": "1",
		"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
		"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
		"Sec-Fetch-Site": "none",
		"Sec-Fetch-Mode": "navigate",
		"Sec-Fetch-Dest": "document",
		"Referer": "https://www.google.com/",
		"Accept-Encoding": "gzip, deflate, br",
		"Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
	},
	// Chrome 83 Windows
	{
		"Connection": "keep-alive",
		"Upgrade-Insecure-Requests": "1",
		"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
		"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
		"Sec-Fetch-Site": "same-origin",
		"Sec-Fetch-Mode": "navigate",
		"Sec-Fetch-User": "?1",
		"Sec-Fetch-Dest": "document",
		"Referer": "https://www.google.com/",
		"Accept-Encoding": "gzip, deflate, br",
		"Accept-Language": "en-US,en;q=0.9"
	}
];

/**
 * Loose sanity check that a string contains something URL-shaped
 * (optional scheme, a domain with a 2-6 letter TLD, optional path/query).
 *
 * @param {string} string - candidate URL text
 * @returns {boolean} true when a URL-like substring is found
 */
function isValidURL(string) {
	// fresh regex literal on every call, so the /g flag's lastIndex state
	// can never leak between invocations
	const urlPattern = /(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/g;
	return urlPattern.test(string);
}

/**
 * Lambda handler: fetch a URL with browser-like headers and store the
 * response in MongoDB.
 *
 * Expects event.body to be a JSON string with:
 *   url  (required) - the page to fetch
 *   json (optional) - if the key is present, request a JSON response
 *   ajax (optional) - if the key is present, send an XMLHttpRequest header
 *
 * @param {object} event - API Gateway proxy event
 * @returns {Promise<object>} {ok: 1, url, insertedId} on success, otherwise
 *   {ok: 0, ...} with a msg and/or status describing the failure
 */
module.exports.scrape = async event => {
	// parse the body assuming a POST with a JSON body; reject malformed
	// JSON cleanly instead of letting the invocation fail with an
	// unhandled exception
	let body;
	try {
		body = JSON.parse(event.body);
	} catch (err) {
		return {ok: 0, msg: 'Invalid JSON body'};
	}

	// url is required
	if (!('url' in body)) {
		return {ok: 0, msg: 'Missing URL'};
	}

	// check the url is valid
	if (!isValidURL(body.url)) {
		return {ok: 0, msg: 'Invalid URL'};
	}

	const url = body.url;
	const host = new URL(url);

	// randomly select a header profile and COPY it: the objects in
	// headers_list are shared module state, and mutating one in place would
	// pollute it for every later (warm) invocation of this Lambda
	const headers = {...headers_list[Math.floor(Math.random() * headers_list.length)]};
	// the request should look like it is originating from the host
	headers['Host'] = host.host;
	// referer is from the same domain, referers from google.com are often
	// redirected, which we want to avoid
	headers['Referer'] = host.origin;

	// set json headers if we expect the response to be in json
	if ('json' in body) {
		headers['Accept'] = 'application/json, text/javascript, */*; q=0.01';
	}

	// set XMLHttpRequest header, which helps when calling private APIs that
	// would typically be loaded by AJAX calls
	if ('ajax' in body) {
		headers['X-Requested-With'] = 'XMLHttpRequest';
	}

	// send a GET request with our headers; axios rejects on non-2xx status
	// by default, so catch and report the bad status instead of crashing
	// the invocation (previously the non-200 branch was unreachable)
	let response;
	try {
		response = await request({
			url: url,
			method: 'get',
			headers: headers,
		});
	} catch (err) {
		return {
			ok: 0,
			url: url,
			status: err.response ? err.response.status : undefined,
			msg: 'Bad response status'
		};
	}

	if (response.status === 200) {
		// create a data object containing the response body and headers
		const date = new Date();
		const data = {
			'url': url,
			'url_hash': crypto.createHash('md5').update(url).digest("hex"),
			'host': host.host,
			'data': response.data,
			'processed': false,
			'scraped_at': date,
			'scraped_year': date.getFullYear(),
			'scraped_month': date.getMonth() + 1,
			'scraped_day': date.getDate(),
			'response_headers': response.headers,
			'request_headers': headers
		};

		// create a connection to the MongoDB
		const client = await MongoClient.connect(mongo_host, {useUnifiedTopology: true});
		try {
			// select the database and insert the scraped document
			const db = client.db(databaseName);
			const r = await db.collection(collectionName).insertOne(data);

			// insertedId is set on a successful insert in both the v3 and
			// v4+ driver result shapes (v4 removed insertedCount)
			if (r.insertedId) {
				return {
					ok: 1,
					url: url,
					insertedId: r.insertedId,
				};
			}
		} finally {
			// always release the connection, even if the insert throws
			await client.close();
		}
	}

	return {
		ok: 0,
		url: url
	};
};

#Javascript