# Purpose: download a web page and show how to display its hyperlink (<a>)
# elements: the tags themselves, their text, and their href attributes.
# NOTE(review): the original note referred to www.tsu.edu.tw, but the URL
# actually fetched below is on acupun.site.
#
# Web-crawler recipe, in 3 steps: web file => raw text => parsed HTML.
import urllib.request as request

from bs4 import BeautifulSoup as soup

url = "https://acupun.site/lecture/python_data/index.htm"

# Step (1): fetch the web file.
# Step (2): read its content.
# The response is opened in a `with` block so the underlying connection is
# always closed, even if reading fails.
with request.urlopen(url) as web:
    txt = web.read()

# Step (3): parse the content into an HTML tag tree.
html = soup(txt, 'html.parser')

# Ways to search for tags:
#   method 1: html.find_all('a')
#   method 2: html.select('a')
# The document is not modified below, so the <a> list is queried once and
# reused: links[0] is the first <a> found, links[1] the second, and so on.
links = html.select('a')

# 1. Get the 1st <a> tag and its text.
print('1.the 1st tag = ', html.a)
print('1.the txt of 1st tag = ', html.a.string)

# 2. Get the 2nd <a> tag and its text.
print('2.the 2nd tag = ', links[1])
print('2.the txt of 2nd tag = ', links[1].string)

# 3. Get the 3rd <a> tag and read its href attribute.
# An attribute of a tag is read with .get(), e.g. html.a.get('href').
print('3.get the parameter href of the 3rd tag = ', links[2].get('href'))

# Key point 1: there are 2 kinds of data source
#   (1) small data: traditional stores such as excel, csv, json, mysql
#   (2) big data: modern internet web data held in html tags
# Key point 2: the data structure of an internet web file
#   (1) column name   = the html tag name, like <a>
#   (2) record data 1 = the text inside the tag
#                       command: html.a.string
#   (3) record data 2 = an attribute of the tag, e.g. its href
#                       command: html.a.get('href')

# 4. Print the text of every <a> tag, prefixed by its index.
for index, link in enumerate(links):
    print(index, link.string)

# 5. Print the href of every <a> tag, prefixed by its index.
for index, link in enumerate(links):
    print(index, link.get('href'))

# 6. Get the element whose id is "exp2_1".
# This prints the whole element; html.select('#exp2_1')[0].string would
# give only its text content instead.
print('6.get the data of id(exp2_1)=', html.select('#exp2_1')[0])