# Extract links from HTML content
# Author: Wsricardo
# Blog: wsricardo.blogspot.com
#
import urllib.request
import re
# Url for website address (only used by the optional network fetch in main()).
url = 'your url'

# Local HTML file that is parsed, and the text file that receives the links.
HTML_PATH = 'page.html'
LINKS_PATH = 'links.txt'

# Matches the opening tag of an anchor and captures its href value.
#   * ``<a\s`` requires whitespace after the tag name, so ``<abbr>`` or
#     ``<area>`` are not mistaken for anchors.
#   * ``(?:[^>]*?\s)?`` skips any attributes that come before href and forces
#     href to be preceded by whitespace, so ``data-href`` is not picked up.
#   * The value is captured separately for double- and single-quoted forms so
#     that it always stops at the quote character that opened it.
# NOTE: a regular expression only approximates HTML parsing; unquoted href
# values and '>' characters inside attribute values are not handled.
_LINK_RE = re.compile(
    r"""<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)')""",
    re.IGNORECASE,
)


def req(url):
    """Open *url* with urllib and return the response object.

    Call ``.read()`` on the result to obtain the raw page bytes.
    """
    return urllib.request.urlopen(url)


def extract_links(html: str) -> list:
    """Return the href values of all ``<a>`` tags found in *html*.

    Links are returned in document order, duplicates included. Anchors with
    an empty href are skipped. An empty list is returned when nothing matches.
    """
    # Exactly one of the two groups is set, depending on the quote style used.
    return [m.group(1) or m.group(2) for m in _LINK_RE.finditer(html)]


def save_links(links, path=LINKS_PATH):
    """Write *links* to the text file *path*, one link per line."""
    # The context manager closes the file even if a write fails.
    with open(path, 'w', encoding='utf-8') as out:
        out.writelines(f'{link}\n' for link in links)


def main():
    """Read the local HTML file, print the links it contains and save them."""
    # Undecodable bytes are replaced rather than aborting the whole run.
    with open(HTML_PATH, 'r', encoding='utf-8', errors='replace') as f:
        data = f.read()
    # To fetch the page over the network instead of reading the local file:
    # data = req(url).read().decode('utf-8')

    links = extract_links(data)
    print(links)
    for link in links:
        print(link)

    # Save links in text file
    save_links(links)


if __name__ == '__main__':
    main()
# (Blog page footer captured together with the script; not part of the program.)
# No comments:
# Post a comment