Web Scraping#
# Scraping a website can be done using -
# 1) API
# 2) Web Scraping using html
STEP 0) Install all the dependencies#
- pip install requests
- pip install bs4
- pip install html5lib
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Address of the page that this walkthrough fetches and parses.
url = "https://www.codewithharry.com"
STEP 1) Get the html#
# Download the page. requests applies no timeout by default, so give it an
# explicit limit (in seconds) instead of risking an indefinite hang.
r = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx response rather than parsing an error page as if
# it were the real content.
r.raise_for_status()
# Raw response body as bytes.
htmlContent = r.content
# print(htmlContent)
STEP 2) Parse the html#
# Build the parse tree from the downloaded markup using the 'html.parser'
# backend.
soup = BeautifulSoup(htmlContent, features='html.parser')
# print(soup.prettify())
STEP 3) Html tree traversals#
- Commonly used types of objects :
- i) Tag : print(type(title))
- ii) NavigableString : print(type(title.string))
- iii) BeautifulSoup : print(type(soup))
- iv) Comment
# Using the Comment object: the text inside <!-- ... --> is exposed through
# the enclosing tag's .string.
markup = "<p><!--this is a comment--></p>"
# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed (so results can differ between machines) and emits
# a GuessedAtParserWarning.
soup2 = BeautifulSoup(markup, 'html.parser')
print(soup2.p)
print(type(soup2.p))
print(soup2.p.string)
print(type(soup2.p.string))
# Get the title of the HTML page
title = soup.title

# Get all the paragraphs from the page
paras = soup.find_all('p')
print(paras)

# Get the first paragraph in the HTML page. find() returns None when the page
# has no <p>, so look it up once and guard the accesses that depend on it.
first_para = soup.find('p')
print(first_para)

# Get the classes of that paragraph. .get() gives None for a tag without a
# class attribute, where ['class'] would raise KeyError.
if first_para is not None:
    print(first_para.get('class'))

# Find all the paragraphs with class "lead"
print(soup.find_all("p", class_="lead"))

# Get the text from a single tag
if first_para is not None:
    print(first_para.get_text())

# Get all the text from the HTML
print(soup.get_text())
# Get all the anchor tags from the page
anchors = soup.find_all('a')
# print(anchors)

# Get all the links on the page, de-duplicated via a set.
all_links = set()
for anchor in anchors:
    href = anchor.get('href')
    # <a> tags without an href yield None; "#" is only a placeholder link.
    if href is None or href == '#':
        continue
    # urljoin resolves relative hrefs against the page URL and leaves hrefs
    # that are already absolute untouched, instead of blindly prefixing the
    # site address onto every value.
    all_links.add(urljoin(url, href))

# Sets have no defined order; sort so repeated runs print the same listing.
for link in sorted(all_links):
    print(link)
navbarSupportedContent = soup.find(id='navbarSupportedContent')
# find() returns None when nothing matches; stop with a clear message rather
# than an AttributeError on the first attribute access below.
if navbarSupportedContent is None:
    raise LookupError("no element with id 'navbarSupportedContent' found on the page")

# .contents - the tag's direct children, available as a list
# .children - the same direct children, available for iteration
for child in navbarSupportedContent.contents:
    print(child)
for child in navbarSupportedContent.children:
    print(child)

# .strings - every piece of text inside the tag, whitespace included
for text in navbarSupportedContent.strings:
    print(text)

# .stripped_strings - the same text with surrounding whitespace removed and
# whitespace-only strings skipped
for text in navbarSupportedContent.stripped_strings:
    print(text)
# .parent - the tag that directly encloses this one
print(navbarSupportedContent.parent)

# .parents - printing it shows a generator object; iterating it yields each
# enclosing element in turn
print(navbarSupportedContent.parents)
for ancestor in navbarSupportedContent.parents:
    print(ancestor)

# Print only the tag name of each ancestor
for ancestor in navbarSupportedContent.parents:
    print(ancestor.name)
# next_sibling and previous_sibling
# Whitespace and newlines between tags count as siblings too, so the first
# hop is often a text node and the second hop is the neighbouring tag.
# A sibling is None at either end of the parent's children, so guard the
# second hop instead of chaining attribute access on None.
next_node = navbarSupportedContent.next_sibling
print(next_node)
print(next_node.next_sibling if next_node is not None else None)

previous_node = navbarSupportedContent.previous_sibling
print(previous_node)
print(previous_node.previous_sibling if previous_node is not None else None)
# CSS selectors with select(): "#name" selects by id, ".name" selects by class.
# Each call's result is stored in elem and printed, id selector first.
for selector in ('#loginmodel', '.loginmodel'):
    elem = soup.select(selector)
    print(elem)