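# Purpose: read a web page and show how to display all of its hyperlink (<a>) elements.
# In short: get the HTML tags from a web file on the internet.
# NOTE: the original note named www.tsu.edu.tw, but the script fetches the `url` set below.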
import urllib.request as request

from bs4 import BeautifulSoup as soup

# Page to crawl.
url = "https://acupun.site/lecture/python_data/index.htm"

# Web-crawler recipe in three steps: web file => raw text => parsed HTML.
# Step 1: open the web file. The `with` block closes the HTTP response
# (and its underlying socket) as soon as the body has been read, instead
# of leaving it open for the rest of the script.
with request.urlopen(url) as web:
    # Step 2: read the whole response body.
    txt = web.read()

# Step 3: parse the downloaded text into a tree of HTML tags.
html = soup(txt, 'html.parser')
# 1. First <a> tag: `html.a` is the shorthand for the first <a> in the page.
first_link = html.a
print('1.the 1st tag = ', first_link)
print('1.the txt of 1st tag = ', first_link.string)

# 2. Second <a> tag.
# Two ways to search for tags:
#   method 1: html.find_all('a')
#   method 2: html.select('a')
# html.select('a')[0] is the first <a> found, html.select('a')[1] the second.
all_links = html.select('a')
second_link = all_links[1]
print('2.the 2nd tag = ', second_link)
print('2.the txt of 2nd tag = ', second_link.string)

# 3. Third <a> tag: read its href attribute.
# An attribute of a tag is read with .get(), e.g. html.a.get('href').
third_link = all_links[2]
print('3.get the parameter href of the 3rd tag = ', third_link.get('href'))
# Key point 1: there are two kinds of data sources
# (1) small data: traditional databases and files, e.g. Excel, CSV, JSON, MySQL
# (2) big data: modern internet web data, stored in HTML tags
# Key point 2: the data structure of a web page
# (1) column name = the HTML tag name, e.g. <a>
# (2) record data 1 = the text inside the tag, e.g. the link text "首府大學"
#     command: html.a.string
# (3) record data 2 = an attribute of the tag, e.g. the href of the "首府大學" link
#     command: html.a.get('href')
# Select every <a> tag once and reuse the result for both listings below.
links = html.select('a')

# 4. Print the position and text of every <a> tag.
for index, link in enumerate(links):
    print(index, link.string)

# 5. Print the position and href attribute of every <a> tag.
# .get('href') yields None for an <a> without an href instead of raising.
for index, link in enumerate(links):
    print(index, link.get('href'))
# 6. Get the element whose id is "exp2_1".
# The CSS selector '#exp2_1' matches by id; [0] takes the first match.
# To display only the element's inner content rather than the whole tag:
#   html.select('#exp2_1')[0].string
exp_matches = html.select('#exp2_1')
print('6.get the data of id(exp2_1)=', exp_matches[0])