Extract All Links from HTML Content with Python

<!DOCTYPE html>
<html lang="pt-BR">
<head>
<meta charset="utf-8">
<title>Sample links page</title>
</head>
<body>
<!-- Sample anchors for the link extractor: absolute http/https URLs mixed
     with a relative file, a scheme-less host, a fragment and a relative path. -->
<a href="sas.svg">BannerImage</a>
<a href="http://www.google.com">Google</a>
<a href="https://www.dimensaoalfa.com.br">Notícias Dimensão Alfa</a>
<a href="www.wsricardo.blogspot.com">Meu Blog</a>
<a href="#">Banner</a>
<a href="noticias/n1.html">Notícias</a>
<a href="http://youtube.com">Youtube</a>
<a href="https://wikipedia.org">Wikipédia</a>
</body>
</html>
view raw main.html hosted with ❤ by GitHub
"""
Author: WSRicardo
Youtube: https://www.youtube.com/@dimensaoalfa
Blog: https://wsricardo.blogspot.com
Site: www.dimensaoalfa.com.br
Github: www.github.com/wsricardo
"""
from bs4 import BeautifulSoup
def get_links(webcontent):
    """
    Extract the absolute HTTP(S) links from an HTML document.

    Arguments:
        webcontent - HTML text to parse.

    Returns:
        list[{'url': str, 'text': str}] - one entry per <a> tag, in document
        order, whose href starts with "http://" or "https://" (scheme matched
        case-insensitively) and whose anchor text, once surrounding
        whitespace is stripped, is longer than 3 characters. 'url' is the
        href exactly as written in the document; 'text' is the stripped
        anchor text.
    """
    soup = BeautifulSoup(webcontent, 'html.parser')
    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # Missing or empty href: there is no target to report.
            continue
        # Test the scheme prefix instead of searching for "http" anywhere in
        # the value, so a relative path such as "docs/http-guide.html" is
        # not mistaken for an absolute URL.
        if not href.strip().lower().startswith(('http://', 'https://')):
            continue
        # Apply the length filter to the same stripped text that is stored,
        # so whitespace padding cannot let a very short label through.
        text = anchor.text.strip()
        if len(text) > 3:
            links.append({'url': href, 'text': text})
    return links
if __name__ == "__main__":
    # Demo run: load the sample page from the current directory, echo its
    # markup, then print the links extracted from it.
    with open("main.html", "r", encoding='utf-8') as source_file:
        page_markup = source_file.read()
    print(page_markup)
    print(get_links(page_markup))
view raw spider.py hosted with ❤ by GitHub

Nenhum comentário:

Postar um comentário