WSRicardo: Tracker - Get links from Html

This code uses Python's built-in "urllib" module to get all the links on a web site by reading its HTML content.

This code was written on an Android cell phone.

# Extract links in html content
# Author: Wsricardo
# Blog: wsricardo.blogspot.com
#
import urllib.request
import re

# Url for website address
url = 'your url'

# Matches an <a ...> tag and captures its href value.
#  - "<a\s+" requires whitespace after the tag name, so tags such as
#    <abbr> or <area> are not mistaken for anchors.
#  - "(?:[^>]*?\s)?" skips any attributes before href and forces href to be
#    preceded by whitespace, so "data-href" is not picked up.
#  - The double-quoted and single-quoted forms are separate alternatives so
#    the value stops at the same kind of quote that opened it.
#  - Empty href values are not matched (at least one character is required).
_LINK_RE = re.compile(
    r"""<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)')[^>]*>""",
    re.IGNORECASE,
)


def req(url):
    """Open *url* and return the response from ``urllib.request.urlopen``.

    The caller reads (and should close) the response, for example::

        with req(url) as resp:
            data = resp.read().decode('utf-8')
    """
    return urllib.request.urlopen(url)


def extract_links(html):
    """Return the href values of the ``<a>`` tags found in *html*.

    Args:
        html: HTML document text.

    Returns:
        A list of href strings in document order. Duplicates are kept.
        The list is empty when no quoted, non-empty href is found.
    """
    # Exactly one of the two groups is set, depending on the quote style.
    return [m.group(1) or m.group(2) for m in _LINK_RE.finditer(html)]


def main():
    """Read ``page.html``, print its links and save them to ``links.txt``."""
    # Html file. To work on a live page instead, read it through req(url)
    # as shown in that function's docstring.
    with open('page.html', 'r', encoding='utf-8') as f:
        data = f.read()

    g = extract_links(data)

    print(g)
    for i in g:
        print(i)

    # Save links in text file, one per line. The context manager closes the
    # file even if a write fails.
    with open('links.txt', 'w', encoding='utf-8') as f:
        f.writelines(f'{i}\n' for i in g)


if __name__ == '__main__':
    main()

WSRicardo

Tracker - Get links from Html

Nenhum comentário:

Postar um comentário