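# Purpose: read a web page (the lesson originally used www.tsu.edu.tw) and show how to display all of its hyperlink elements.
# The script downloads one web page from the internet and extracts its HTML tags.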
# Web-crawler pipeline in three steps: web page => raw text => parsed HTML tags.
import urllib.request as request

from bs4 import BeautifulSoup as soup

url = "https://acupun.site/lecture/python_data/index.htm"

# Steps 1-2: open the URL and read the whole response body.
# The `with` block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open for the rest of the script.
with request.urlopen(url) as web:
    txt = web.read()

# Step 3: parse the downloaded content into a tree of HTML tags.
html = soup(txt, 'html.parser')

# Searching for tags can be done in two ways:
#   method 1: html.find_all('a')
#   method 2: html.select('a')
# Run the search once and reuse the result for items 1-3 below.
anchors = html.select('a')


def _anchor_at(position):
    """Return the <a> tag at *position*, or None if the page has too few."""
    return anchors[position] if position < len(anchors) else None


# anchors[0] is the 1st <a>, anchors[1] the 2nd, anchors[2] the 3rd.
first, second, third = (_anchor_at(i) for i in range(3))

# 1. The 1st <a> tag and its text.
print('1.the 1st <a> tag = ', first)
print('1.the txt of 1st <a> tag = ', first.string if first is not None else None)

# 2. The 2nd <a> tag and its text.
print('2.the 2nd <a> tag = ', second)
print('2.the txt of 2nd <a> tag = ', second.string if second is not None else None)

# 3. The href attribute of the 3rd <a> tag.
# An attribute of a tag is read with tag.get('<attribute name>'),
# e.g. html.a.get('href') for the first <a>.
print('3.get the parameter href of the 3rd tag <a> = ',
      third.get('href') if third is not None else None)


#key point 1: there are 2 kinds of data source
#(1).(small data): traditional databases and files: excel, csv, json, mysql
#(2).(big data): modern internet web data held in html tags

#key point 2: the data structure of an internet web file
#(1).column name = html tag name, e.g. <a>
#(2).record data 1 = the string of the tag, e.g. <a>首府大學</a>
#          command = html.a.string
#(3).record data 2 = an attribute of the tag, e.g. <a href='http://www.tsu.edu.tw'>首府大學</a>
#          command = html.a.get('href')

# Every <a> tag on the page, in document order; shared by items 4 and 5.
links = html.select('a')

# 4. Print the index and the text of each <a> tag.
for position, link in enumerate(links):
    print(position, link.string)

# 5. Print the index and the href attribute of each <a> tag.
for position, link in enumerate(links):
    print(position, link.get('href'))
    
    
# 6. Print the element whose id is "exp2_1".
# select_one() returns the first match, or None when nothing matches, so a
# page without that id prints None instead of raising IndexError.
# To show only the element's text (when the element exists), use
# html.select_one('#exp2_1').string
print('6.get the data of id(exp2_1)=', html.select_one('#exp2_1'))