Web Scraping

# Scraping a website can be done using -
#     1) An API
#     2) Web scraping using HTML

STEP 0) Install all the dependencies

  • pip install requests
  • pip install bs4
  • pip install html5lib
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Base URL of the site being scraped; reused by the blocks below
url="https://www.codewithharry.com"

STEP 1) Get the html

# Fetch the page. A timeout is set so a dead/slow server cannot hang
# the script forever, and raise_for_status() makes an HTTP error
# (404/500) fail loudly instead of silently parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()
htmlContent = r.content  # raw bytes of the response body
# print(htmlContent)

STEP 2) Parse the html

# Parse the raw bytes into a BeautifulSoup tree using the stdlib parser
soup = BeautifulSoup(htmlContent,'html.parser')
# print(soup.prettify())  # prettify is a method - call it to get indented markup

STEP 3) HTML tree traversals

  • Commonly used type of objects :
  • i) Tag : print(type(title))
  • ii) Navigable String : print(type(title.string))
  • iii) BeautifulSoup : print(type(soup))
  • iv) Comment
# Using the Comment object: text inside <!-- --> parses to a
# bs4 Comment (a NavigableString subclass).
markup="<p><!--this is a comment--></p>" 
# Pass the parser explicitly; omitting it triggers a
# GuessedAtParserWarning and can yield different trees on machines
# with different parsers installed.
soup2 = BeautifulSoup(markup, 'html.parser')
print(soup2.p)
print(type(soup2.p))

print(soup2.p.string)        # the comment's text
print(type(soup2.p.string))  # bs4 Comment type, not a plain string
# Grab the <title> tag of the HTML page
title = soup.title

# Collect every <p> tag on the page; calling the soup object
# directly is shorthand for find_all()
paras = soup('p')
print(paras)


# find() returns only the FIRST matching <p> element (or None)
print(soup.find('p'))

# ['class'] gives the list of CSS classes on a tag
# (raises KeyError if the tag has no class attribute)
print(soup.find('p')['class'])

# find_all with class_= filters by CSS class ("class" is a Python
# keyword, hence the trailing underscore)
print(soup.find_all("p" , class_="lead"))

# get_text() returns the concatenated text inside one tag
print(soup.find('p').get_text())

# Called on the soup itself, get_text() returns all text on the page
print(soup.get_text())


#Get all the anchor tags from the page
anchors = soup.find_all('a')

# print(anchors)
all_links = set()  # set de-duplicates repeated links automatically
#Get all the links on the page
for link in anchors:
    href = link.get('href')
    # Skip anchors with no href attribute (get() returns None,
    # which would crash the string concatenation) and bare '#' links
    if href and href != '#':
        # urljoin resolves relative hrefs ("/about") against the base
        # and leaves absolute hrefs ("https://...") untouched, instead
        # of blindly prepending the base to everything
        linkText = urljoin("https://codewithharry.com", href)
        all_links.add(linkText)
        # print(linkText)
for link in all_links:
    print(link)

# The tag with id="navbarSupportedContent" (None if no such id exists)
navbarSupportedContent = soup.find(id='navbarSupportedContent')
# .contents - A tag's children as a list (fully stored in memory)
# .children - A tag's children as a generator (iterated lazily; faster for big pages)
for elem in navbarSupportedContent.contents:
    print(elem)

# Same children as above, streamed one at a time
for elem in navbarSupportedContent.children:
    print(elem)

# .strings yields every text node in the subtree,
# including whitespace-only ones
for items in navbarSupportedContent.strings:
    print(items)

# .stripped_strings skips whitespace-only text nodes and strips
# the rest - much cleaner output
for items in navbarSupportedContent.stripped_strings:
    print(items)

# .parent is the single immediate enclosing tag
print(navbarSupportedContent.parent)

# .parents is a generator, so printing it directly shows a generator
# object - it has to be iterated to see the ancestors
print(navbarSupportedContent.parents)

# Print every ancestor tag, walking up to the document root
for item in navbarSupportedContent.parents:
    print(item)

# Print just the tag NAMES of every ancestor
for item in navbarSupportedContent.parents:
    print(item.name)

#next_sibling and previous_sibling 
print(navbarSupportedContent.next_sibling) # whitespace/newlines between tags count as (string) siblings
print(navbarSupportedContent.next_sibling.next_sibling ) # so the neighbouring TAG is usually two siblings away

print(navbarSupportedContent.previous_sibling) # same applies in the backward direction
print(navbarSupportedContent.previous_sibling.previous_sibling)

# select() takes a CSS selector; '#...' selects by id and always
# returns a LIST of matches (empty list if nothing matched)
# NOTE(review): the id is spelled 'loginmodel' - confirm it is not
# meant to be 'loginModal' on the actual page
elem = soup.select('#loginmodel')
print(elem)

# '.'-prefixed selector selects by CSS class, again returning a list
elem = soup.select('.loginmodel')
print(elem)