Web scraping with Python

Beautiful Soup is a lightweight and highly effective library that makes it child's play to scrape information from web pages. We will also use the requests library, a simple HTTP library for Python that lets you send HTTP requests in a very easy and intuitive way.

First, we need to install the libraries: pip install requests beautifulsoup4 lxml (lxml is an optional parser and html5lib is another option; the examples below use Python's built-in "html.parser"). Next, we will make a GET request: requests.get(url). The GET method is used to retrieve information from a server using a given URI, i.e., to ask for a specific HTML document.

import requests
from bs4 import BeautifulSoup as bs

def get_all_urls(url):
	my_urls = []
	
	response = requests.get(url)
	# The response object contains all the data sent from the web server in response to our GET request. 
	# It includes its headers and the data payload.
	if response.status_code == 200:
	# The status_code informs us of the status of the request. A 200 OK status means that the request was successful. However, a 404 NOT FOUND status means that the resource that we were looking for was not found on the server.
		soup = bs(response.content, "html.parser")

Therefore, if the request was successful, we are interested in the content of the response itself. response.content returns the raw bytes of the data payload.

soup is a BeautifulSoup instance which will be created from an HTML document and the parser that we are providing. To show the contents of the page, we can print it with the prettify() method: print(soup.prettify())

		my_links = soup.find_all("a", {"class": "btn btn-primary btn-block w-100"})

We can extract single or multiple occurrences of a specific tag. find takes the name of the tag and returns the first match. We will use find_all to extract all the occurrences of a given tag ("a") within a document. find_all also accepts a list of tag names, e.g., soup.find_all(["a", "p", "ul"]).

We can refine the search and find elements by id (e.g., soup.find("div", {"id": "aParticularId"})) or by class name.

		for my_link in my_links: # find_all returns a list of elements rather than just the first element.
			my_urls.append(my_link.get("href"))

First, we have isolated all "a" tags of a particular class ("btn btn-primary btn-block w-100").
Second, we are extracting the link stored in the href attribute of each of those tags.
The get method is used here to retrieve values of attributes on a tag.
Sometimes, we are looking for the text within a tag: soup.find(‘p’).get_text()

	return my_urls
# Run the example only when this file is executed directly, not when it is imported.
# NOTE(review): "yourUrl.html" looks like a placeholder for the page to scrape; confirm before running.
if __name__ == '__main__':
	get_all_urls("yourUrl.html")

Download All Images from A WebPage

Credits: PythonCode, How to Download All Images from a Web Page in Python

from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
import os

def get_all_images(url):
    # It returns all image URLs from a particular page (url) 
    response = requests.get(url)
    soup = bs(response.content, "html.parser")

    urls = []
    for img in soup.find_all("img"):
	# We extract all img tags of the given url, and iterate over the list.
        img_url = img.attrs.get("src")
        if not img_url:
        # We can grab the image's url by accessing the "src" attribute. However, there are some "img" tags that do not contain it, we just get rid of them.
            continue
    
        # Next, we need to get the absolute URL. urljoin() constructs a full ("absolute") URL by joining or combining a base URL with the URL that was just extracted
        img_url = urljoin(url, img_url)

        try:
            pos = img_url.index("?")

Some URLs contain HTTP GET key-value pairs (a query string), like

"http://justtothepoint.com/myImage.jpeg?key=value..."

img_url.index("?") returns the position of the "?" character (and raises ValueError when there is none), so we only keep the part of the URL before that position, i.e., characters 0…pos-1: img_url[:pos]

            img_url = img_url[:pos]
        except ValueError:
            pass

	parsed_img_url = urlparse(img_url)
	# urlparse parse a URL into six components, returning a 6-item named tuple. 
	# This corresponds to the general structure of a URL: scheme://netloc/path;parameters?query#fragment
	# Another option to remove URL Get parameters: parsed_href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if bool(parsed_img_url.netloc) and bool(parsed_img_url.scheme):
	# We append the image's URL to our list because we have identified this URL as a valid one.
 	# The URL is a valid one because it has a domain name (parsed_img_url.netloc) and a protocol (parsed_img_url.scheme) 
            urls.append(parsed_img_url)

    return urls 

	def createRemoveContentDirectory(pathname):
		"""
		Make sure pathname exists and is empty.

		If pathname does not exist, it is created (including any missing
		parent directories). Otherwise every entry inside it is deleted:
		plain files and symbolic links are unlinked, and sub-directories
		are removed recursively.

		Args:
			pathname: Path of the directory to create or to empty.
		"""
		import shutil  # Local import: only this helper needs it.

		# If path does not exist, we create it and there is nothing to clean.
		if not os.path.isdir(pathname):
			os.makedirs(pathname)
			return

		# Otherwise, we delete all its content.
		for name in os.listdir(pathname):
			entry = os.path.join(pathname, name)
			# os.remove() fails on directories, so real sub-directories need rmtree().
			# A symlink pointing at a directory is unlinked instead of being followed.
			if os.path.isdir(entry) and not os.path.islink(entry):
				shutil.rmtree(entry)
			else:
				os.remove(entry)

	def crawl(url, path, delay=1):
		"""
		Download every image referenced by the page at url into path.

		The target directory is created, or emptied if it already exists,
		before anything is downloaded.

		Args:
			url: Address of the page whose images are downloaded.
			path: Directory that receives the downloaded images.
			delay: Seconds to wait before each download (default 1).
		"""
		import time  # Local import: time is not among the imports listed for this script.

		createRemoveContentDirectory(path)
		# Let's get all images from the given URL and download them in path.
		imgs = get_all_images(url)
		for img in imgs:
			time.sleep(delay) # This pause is to avoid being blocked by the server hosting the page.
			# For each image, let's download it.
			download(img, path) # The download() function is good as it is in the mentioned article, so I will not include it.
# Start the crawl only when this file is executed directly, not when it is imported.
# NOTE(review): "myUrl" and "myDownloadPath" look like placeholders; confirm real values before running.
if __name__ == '__main__':
    	crawl("myUrl","myDownloadPath")

Modify HTML files to change their images with optimized ones with safe backwards compatibility

Let’s change our old html file with jpg images:

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<meta http-equiv="X-UA-Compatible" content="IE=edge">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Document</title>
</head>
<body>
	<img src='http://justtothepoint.com/wp-content/uploads/2016/07/img_girl.jpg' alt='Girl in a jacket' width='500' height='600'>
</body>
</html>

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title> Document </title>
 </head>
 <body>
  <picture>
   <img src="../myImages/img_girl.jpg"/>
  </picture>
 </body>
</html>

from bs4 import BeautifulSoup
import requests, os
def main(filename):
	with open(filename) as fp:
		soup = BeautifulSoup(fp, 'html.parser')

		for tag in soup.findAll("img"): 
			# We extract all img tags of the given file, and iterate over the list.
			stringImageName = os.path.basename(tag['src']).rsplit( ".", 1) [ 0 ] 
			# os.path.basename(path) returns the base name of pathname path, e.g., 
			# http://justtothepoint.com/wp-content/uploads/2016/07/img_girl.jpg returns img_girl.jpg
			# rsplit splits the base name into a list using the dot "." as a separator, e.g., [ "img_girl", "jpg" ] 
			stringImageExtension = os.path.basename(tag['src']).rsplit( ".", 1) [ 1 ] 
			stringImageWebp = "../myImages/" + stringImageName + '.webp'
			stringImageOld = "../myImages/" + stringImageName + '.' + stringImageExtension

			print(stringImageName)
			print(stringImageExtension)
			tag.replace_with(new_picture)

This is where the magic happens! BeautifulSoup’s replace_with() method replaces a tag or string in an element with the provided tag or string. The replacement (new_picture here) must itself be a Beautiful Soup object, i.e., a new tag or a freshly parsed fragment, so that it works as intended.

	<picture>
   		<img src="../myImages/img_girl.jpg"/>
	</picture>

 		with open("target.html", "w") as file:
		# Finally, we create a new HTML file with the new version.	
			file.write(soup.prettify())

# Convert the sample page only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main("source.html")

Changing HTML files in batch

First, we are going to use glob to be able to track and iterate over all sub-folders of a particular folder, let’s say /home/myUser/myWebServer. We want to find all the html files recursively. When recursive is set (recursive = True), **/*.html walks the directories recursively below /home/myUser/myWebServer and returns all absolute path names matching *.html files.

 def main(root='/home/myUser/myWebServer'):
     """
     Process every HTML file found below root, at any depth.

     Each matching path is printed and then handed to processFile().

     Args:
         root: Folder whose sub-folders are searched recursively for
             *.html files. Defaults to the article's example folder.
     """
     # Local imports keep this snippet runnable on its own.
     import glob
     import os

     # With recursive=True, "**" matches any number of nested sub-folders.
     pattern = os.path.join(root, '**', '*.html')
     for html_file in glob.glob(pattern, recursive=True):
         print(html_file)
         processFile(html_file)
        
# Start the batch run only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

Next, we are going to process every HTML file programmatically.

def processFile(myfile):
    with open(myfile, "rb") as f:
        soup = Soup(f, 'html.parser')
        # That's how we can extract the language with Beautiful Soup. A tag may have any number of attributes. The html tag has an attribute "lang". We can access a tag's attribute by treating the tag like a dictionary.
        lang = soup.find("html")["lang"]
	# That's how we can extract the title with Beautiful Soup. Observe that a string corresponds to a bit of text within a tag.
        title = soup.find('title').string

        if soup.find_all("style")==[]: # If there is no "style" tag, we will add a new style.	
            addStyle(soup)

We are going to make all our pages responsive, so they can be viewed on a phone or tablet in a nice-looking way. However, they are optimized for landscape mode. We are going to put a message in a div: if the user is in portrait mode, we will ask them to rotate the device to proceed.

	<body>
		<div id="turn">
			<h1>Please rotate your device!</h1>
		</div>

So, let’s continue with our main function:

        if soup.find_all("div", id="turn")==[]: # If there are no "div" tags with an id of "turn". 
            addRotateDevice(soup, lang)

        for mylink in soup.find_all("a", href="myBrokenResourceLink"):
	# Imagine that you want to get rid of a tag and all its content, e.g., remove all links pointing to a particular URL
	# Firstly, we find all the tags "a" from the documents that are pointing to this link.
	# Secondly, we use Tag.decompose() to remove a tag from the tree and all its contents.
            mylink.decompose()
	# It will remove: <a href="myBrokenResourceLink">I was linking to a broken resource link.</a>

Next, we are going to modify JavaScript code in our HTML page. We can add/modify JavaScript code in an HTML document by using the dedicated HTML tag “script” that wraps around JavaScript code. However, we are using Jquery libraries, so we have the following lines in our code:

	<script src="../extras/jquery.min.1.7.js" type="text/javascript"></script>
	<script src="../extras/jquery-ui-1.8.20.custom.min.js" type="text/javascript"></script>

We will check the src attribute of each "script" tag. As you can see in the previous lines, the tags that load our jQuery libraries have a src attribute, whereas our old inline script has none, so myScript.get('src') returns None for it.

        for myScript in soup.find_all("script", type="text/javascript"):
            if myScript.get('src') is None:
                myScript.string = myNewScript

Previously, the JavaScript code has been inserted in a string variable:

		myNewScript = """
		function loadApp() {

			$('#canvas').fadeIn(1000);

			var flipbook = $('.magazine');

			// Check if the CSS was already loaded
			if (flipbook.width()==0 || flipbook.height()==0) {
				setTimeout(loadApp, 10);
				return;
			}
			[...]
		"""

Finally, we update the file with the new content:

    with open(myfile, 'w', encoding='utf-8') as f:	
       f.write(str(soup))

Adding style to our HTML files.

# CSS text that addStyle() places inside a new <style> tag.
# NOTE(review): the "[...]" line looks like an elision left over from the article, not real CSS; confirm and fill in the missing rules before use.
myStyle = """
    .loader {
        border: 16px solid #f3f3f3; 
        border-top: 16px solid #3498db; 
        border-radius: 50%;
        width: 320px;
        height: 320px;
        animation: spin 2s linear infinite;
     }
	[...]
"""
def addStyle(soup):
    """
    Add a <style type="text/css"> tag holding myStyle to the document.

    The tag is inserted immediately before <title>. When the document has
    no <title>, it is appended to <head> instead; when there is no <head>
    either, it becomes the first node of the document.

    Args:
        soup: The parsed document; it is modified in place.
    """
    # Build the new tag: <style type="text/css"> ...myStyle... </style>
    style = soup.new_tag('style')
    style['type'] = "text/css"
    style.append(myStyle)

    # Preferred position: right before the title.
    title = soup.find('title')
    if title is not None:
        title.insert_before(style)
        return

    # find() returns None for a missing tag, so fall back instead of crashing.
    head = soup.find('head')
    if head is not None:
        head.append(style)
    else:
        soup.insert(0, style)

This is the end result:

	<style type="text/css">
		.loader {
			border: 16px solid #f3f3f3; 
			border-top: 16px solid #3498db; 
			border-radius: 50%;
			width: 320px;
			height: 320px;
			animation: spin 2s linear infinite;
		}
		[...] 
	</style>
	<title>I don't want to read</title>

Let’s make our HTML pages responsive:

	def addRotateDevice(soup, language):
		"""
		Add <div id="turn"><h1>…</h1></div> asking the user to rotate the device.

		The message is in English when language denotes English ("en",
		"EN", "en-US", ...), and in Spanish for any other value.

		Args:
			soup: The parsed document; it is modified in place.
			language: Value of the page's lang attribute; may be None or empty.
		"""
		# Build the container: <div id="turn"></div>
		divTurn = soup.new_tag('div')
		divTurn['id'] = "turn"

		# lang values may carry a region or use another case ("en-US", "EN"),
		# so only the lower-cased primary subtag is compared.
		primary = str(language or "").strip().lower().replace("_", "-").split("-")[0]
		heading = soup.new_tag("h1")
		if primary == "en":
			heading.append("Please rotate your device!")
		else:
			heading.append("Por favor, rota tu dispositivo!")
		# The h1 tag goes inside the div created above.
		divTurn.append(heading)

		# The div is inserted at index 1 of the body's children. find() returns
		# None when there is no <body>; the document root is used in that case.
		body = soup.find('body')
		target = body if body is not None else soup
		target.insert(1, divTurn)

Bibliography: Beautiful Soup Documentation

Download All Images from A WebPage

Modify HTML files to change their images with optimized ones with safe backwards compatibility

Changing HTML files in batch

Related Posts