파이썬 Beautifulsoup을 활용한 크롤링 공부
2019. 8. 4. 00:32ㆍ크롤링
파이썬으로 사람인 크롤링해보기
취업준비를 시작하면서 처음으로 이력서 작성과 포트폴리오 작성을 하게 되었습니다. ㅠㅠ
많이 안 해본 서류 작업에 머리가 지끈지끈 아파서 쉬어가는 겸 옛날에 해킹 공부를 열심히 할 때 자주 사용했던 파이썬을 이용해서 구인공고 크롤링을 해봤습니다. 사실 만들고 나서도 그냥 사이트 들어가서 봅니다. ㅎ
실행 화면
해당 공고를 클릭하면 해당 url 링크로 이동합니다.
코드
import sys
from PyQt5.QtWidgets import *
from PyQt5 import uic
import threading
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import webbrowser
# Path to the Qt Designer UI file loaded by MainDialog.
uiFilePath = 'crawlerui.ui'

# Search-URL templates per job site; placeholders are filled via str.format.
url = {
    '사람인': 'http://www.saramin.co.kr/zf_user/search?cat_cd=404&searchword={0}&recruitPage={1}&loc_mcd={2}&{3}&cat_cd=404&{4}',
    '잡플래닛': 'https://www.jobplanet.co.kr/job_postings/search?query={0}&page={1}&city_ids%5B%5D={2}&occupation_level2_ids%5B%5D=11604',
    # NOTE(review): the original text contained the mojibake '®ion=' here
    # (HTML-entity corruption of '&region='); restored to a valid parameter.
    '워크넷': 'https://www.work.go.kr/empInfo/empInfoSrch/list/dtlEmpSrchList.do?keyword={0}&pageindex={1}&region={2}&academicGbn={3}&careerTo={4}&careerTypes={5}',
}

# Shared row counter (the former module-level `global index` was a no-op
# and has been removed; `global` only has meaning inside a function body).
index = 0

# Saramin query-string fragments keyed by the Korean UI labels.
saraminCareer = {'신입': 'exp_cd=1', '경력': 'exp_cd=2&exp_min=0&exp_max={0}', '무관': 'exp_none=y'}
saraminSchool = {'고졸': 'edu_min=6&edu_max=9', '대졸': 'edu_min=8&edu_max=11', '무관': 'edu_none=y'}
saraminLocal = {'서울': '101000', '인천': '108000', '부산': '106000'}

# JobPlanet codes (its crawler is not implemented yet).
jobPlanetCareer = {'신입': '1', '경력': '2', '무관': '4'}
jobPlanetSchool = {}
jobPlanetLocal = {'서울': '1', '인천': '3', '부산': '4'}

# WorkNet codes (its crawler is not implemented yet).
worknetSchool = {'고졸': '03', '대졸2': '04', '대졸4': '05'}
worknetRegion = {'서울': 11000, '부산': 26000, '인천': 28000}
class MainDialog(QDialog):
    """Main window: reads search filters from the UI, crawls job sites in a
    worker thread, and lists the results in a table. Clicking a result row
    opens the posting URL (stored in a hidden column) in the browser."""

    def __init__(self):
        QDialog.__init__(self, None)
        uic.loadUi(uiFilePath, self)
        self.crawlerBtn.clicked.connect(self.startCrawler)
        # Row index at which the next result row will be inserted.
        self.rowPosition = self.crawlerTable.rowCount()
        # Column layout: 0=site, 1=title, 2=location, 3=company, 4=URL.
        self.crawlerTable.setColumnWidth(0, 100)
        self.crawlerTable.setColumnWidth(1, 400)
        self.crawlerTable.setColumnWidth(3, 170)
        self.crawlerTable.setColumnWidth(4, 0)  # URL column kept invisible
        self.crawlerTable.clicked.connect(self.openUrl)

    def openUrl(self):
        """Open the posting URL from column 4 of the clicked row."""
        row = self.crawlerTable.currentRow()
        item = self.crawlerTable.item(row, 4)
        # Guard: item() returns None for clicks on empty cells/rows, and the
        # original code crashed with AttributeError in that case.
        if item is not None:
            webbrowser.open(item.text())

    def startCrawler(self):
        """Collect the filter values chosen in the UI and launch one crawler
        thread per checked job site (only Saramin is implemented)."""
        print('starting crawler')
        searchText = self.searchText.text()
        pageNum = self.page.text()
        saraminChecked = self.saramin.isChecked()
        jobplanetChecked = self.jobplanet.isChecked()
        worknetChecked = self.worknet.isChecked()

        # Career: 신입(new grad) or 경력(experienced, with max-years field).
        career = ''
        careerText = ''
        if self.careerFirst.isChecked():
            career = '신입'
        elif self.careerGosu.isChecked():
            career = '경력'
            careerText = self.careerGosuText.text()

        # Education level.
        school = ''
        if self.highSchool.isChecked():
            school = '고졸'
        elif self.university.isChecked():
            school = '대졸'
        elif self.allSchool.isChecked():
            school = '전체'

        # Region.
        local = ''
        if self.seoul.isChecked():
            local = '서울'
        elif self.incheon.isChecked():
            local = '인천'
        elif self.busan.isChecked():
            local = '부산'

        if saraminChecked:
            # Build the full Saramin search URL from the selected filters.
            # NOTE(review): '전체' is not a key of saraminSchool/saraminLocal,
            # so some UI combinations raise KeyError — TODO confirm mapping.
            saraminUrl = url.get('사람인').format(
                urllib.parse.quote(searchText),
                1 if pageNum == '' else pageNum,
                saraminLocal[local],
                saraminSchool[school],
                saraminCareer[career] if career != '경력'
                else saraminCareer[career].format(careerText))
            sThread = threading.Thread(target=self.threadCrawler,
                                       args=(saraminUrl, self.saraminCrawler))
            sThread.start()
        if jobplanetChecked:
            print('잡플레닛 크롤링 해야함')
        if worknetChecked:
            print('워크넷 크롤링 해야함')

    def threadCrawler(self, saraminUrl, callback):
        """Worker-thread entry point: hand the prepared URL to the
        site-specific crawler callback."""
        print(saraminUrl, '사람인 크롤링 시작')
        callback(saraminUrl)

    def saraminCrawler(self, saraminUrl):
        """Fetch one Saramin search-result page and append each posting
        (site, title, location, company, link) to the result table.

        NOTE(review): this runs on a worker thread yet mutates Qt widgets
        directly; Qt only guarantees widget access from the main thread —
        should be refactored to emit a signal. TODO confirm.
        """
        saramUrl = 'http://www.saramin.co.kr'
        print(saraminUrl, '사람인 크롤러 동작')
        with urllib.request.urlopen(saraminUrl) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        # CSS selectors target Saramin's result-list markup (fragile: they
        # break whenever the site changes its layout).
        title = [t.text for t in soup.select('#recruit_info_list > div.content > div > div.area_job > h2 > a > span')]
        links = [saramUrl + a.get('href') for a in soup.select('#recruit_info_list > div.content > div > div.area_job > h2 > a')]
        company = [c.text for c in soup.select('#recruit_info_list > div.content > div > div.area_corp > strong > a > span')]
        location = [lo.text for lo in soup.select('#recruit_info_list > div.content > div > div.area_job > div.job_condition > span:nth-child(1) > a:nth-child(2)')]
        for postingTitle, link, corp, region in zip(title, links, company, location):
            self.crawlerTable.insertRow(self.rowPosition)
            self.crawlerTable.setItem(self.rowPosition, 0, QTableWidgetItem('사람인'))
            self.crawlerTable.setItem(self.rowPosition, 1, QTableWidgetItem(str(postingTitle)))
            self.crawlerTable.setItem(self.rowPosition, 2, QTableWidgetItem(str(region)))
            self.crawlerTable.setItem(self.rowPosition, 3, QTableWidgetItem(str(corp)))
            self.crawlerTable.setItem(self.rowPosition, 4, QTableWidgetItem(str(link)))
            # Advance the insertion point so rows append in scraped order;
            # the original kept it fixed, inserting every new row above the
            # previous one and reversing the list.
            self.rowPosition += 1

    def jobplanetCrawler(self):
        """Placeholder for a JobPlanet crawler (not implemented)."""
        print('잡플래닛 크롤러 콜백')

    def worknetCrawler(self):
        """Placeholder for a WorkNet crawler (not implemented)."""
        print('워크넷 크롤러 콜백')
if __name__ == '__main__':
    # Entry point: build the Qt application, show the main dialog, and run
    # the event loop; propagate its return value as the process exit code.
    # (Guarded so importing this module no longer launches the GUI.)
    app = QApplication(sys.argv)
    main_dialog = MainDialog()
    main_dialog.show()
    sys.exit(app.exec_())
잡플래닛과 워크넷도 하려고 했으나 취준생이라 시간이 남지 않아서 사람인만 했습니다. ( 핑계;; )
'크롤링' 카테고리의 다른 글
깃허브 오픈소스를 활용하여 네이버 뉴스기사 크롤링하기 (0) | 2019.09.06
노드 cheerio-httpcli를 사용한 기사제목 크롤링 해보기 (1) | 2019.06.27