husonet | Tarih: 21.10.2014
Google arama sonuçlarını (Google results) alma işlemi
Python programlama dilini kullanarak Google arama sonuçlarını alma
Python ile Google arama sonuçlarını ('python google results') alıp işleyebileceğiniz aşağıdaki kodu inceleyebilirsiniz.
#!/usr/bin/python
#-*- coding:utf-8 -*-
###############################################################################
# 18.10.2014
# husonet
# Huseyin OZDEMIR
# Fetches and parses search-engine (Google) result pages.
###############################################################################
import random
import pycurl
import StringIO
import urllib
import re
# Pool of ready-made "User-Agent: ..." request header lines; getUrl picks
# one at random for every request.
USER_AGENTS = [
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "User-Agent: Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "User-Agent: Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)",
]

# Result-link pattern: captures <target> from href="/url?q=<target>&...".
# Raw string so the regex escape (\?) is not treated as a string escape.
RE_URL = re.compile(r'href="/url\?q=(.*?)&.*?"', re.DOTALL | re.IGNORECASE)

# Total-result-count pattern: captures the first run of digits/dots that
# follows the right-aligned statistics cell and precedes "(<b>".
RE_URL_TOPLAMSONUC = re.compile(
    r'<td nowrap align="right"><font size="-1">.*?([0-9\.]+).*?\(<b>',
    re.DOTALL | re.IGNORECASE)

# Set to 1 to re-raise exceptions instead of printing them.
DEBUG = 0
class Arama():
    """Fetch Google result pages with pycurl and extract the result links.

    Every method follows the same error policy: when the module-level DEBUG
    flag is set, exceptions are re-raised; otherwise the message is printed
    and the method returns None.
    """

    # Host template; __init__ substitutes the TLD for the '%s' placeholder.
    HOST = 'www.google.%s'
    # Value sent as the "hl" query parameter.
    LANG = None
    # Search phrase sent (URL-quoted) as the "q" query parameter.
    KEYWORD = None
    # URL scheme prefix, 'https://' or 'http://'.
    SSL = None
    # Highest "start" offset that execute() will request.
    SAYFA = None
    # Total-result count string parsed from the first page (offset 0).
    TOPLAMSONUC = None

    #--------------------------------------------------------------------------
    # Runs when the object is created; stores whichever initial values were
    # supplied.
    def __init__(self, sTld=None, sLang=None, sKeyword=None, sSsl=False,
                 sSayfa=None):
        """Store the search configuration.

        sTld     -- top-level domain substituted into HOST (e.g. 'com.tr').
                    When None, HOST keeps its unformatted '%s' placeholder.
        sLang    -- value for the "hl" query parameter.
        sKeyword -- search phrase.
        sSsl     -- truthy selects 'https://', otherwise 'http://'.
        sSayfa   -- upper bound for paging; stored as sSayfa - 10, so
                    execute() requests offsets 0, 10, ... up to that value
                    (sSayfa=30 requests offsets 0, 10 and 20).
        """
        if sTld is not None:
            self.HOST = self.HOST % (sTld,)
        if sLang is not None:
            self.LANG = sLang
        if sKeyword is not None:
            self.KEYWORD = sKeyword
        self.SSL = 'https://' if sSsl else 'http://'
        if sSayfa is not None:
            self.SAYFA = sSayfa - 10

    #--------------------------------------------------------------------------
    # Open the URL
    def getUrl(self, sSayfa=0):
        """Download one result page.

        sSayfa -- "start" offset of the page; 0 requests the first page and
                  omits the parameter.

        Returns the response as captured by curl (response headers included,
        because the HEADER option is enabled), or None on failure when DEBUG
        is off.
        """
        result = ''
        try:
            PAGE = '/search?hl=%s&q=%s&ie=utf-8' % (
                self.LANG, urllib.quote(self.KEYWORD))
            if sSayfa != 0:
                PAGE += '&start=%d' % sSayfa
            headers = [
                random.choice(USER_AGENTS),
                # Was "content-type':'text/plain" (stray quotes), which is
                # not a well-formed "Name: value" header line.
                "Content-Type: text/plain",
            ]
            SIO = StringIO.StringIO()
            URL = self.SSL + self.HOST + PAGE
            ch = pycurl.Curl()
            try:
                ch.setopt(ch.URL, URL)
                ch.setopt(ch.HEADER, True)
                ch.setopt(ch.FOLLOWLOCATION, 1)
                # NOTE(review): certificate and host verification are
                # switched off, so https requests are not authenticated.
                ch.setopt(ch.SSL_VERIFYPEER, False)
                ch.setopt(ch.SSL_VERIFYHOST, False)
                ch.setopt(ch.HTTPHEADER, headers)
                ch.setopt(ch.WRITEFUNCTION, SIO.write)
                ch.perform()
            finally:
                # Release the handle even when setopt()/perform() raises.
                ch.close()
            result = SIO.getvalue()
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            result = None
        return result

    #--------------------------------------------------------------------------
    # Parse the total result count
    def getParseToplamSonuc(self, sBody):
        """Extract the total-result count from a page body.

        Returns the captured digits-and-dots string, or None when the
        pattern is not found. A non-string body is handled by the usual
        DEBUG / print-and-return-None policy.
        """
        try:
            match = RE_URL_TOPLAMSONUC.search(sBody)
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            return None
        # A page without the statistics cell is not an error.
        if match is None:
            return None
        return match.group(1)

    #--------------------------------------------------------------------------
    # Parse the result links of one page
    def getParse(self, sSayfa):
        """Fetch the page at offset sSayfa and return its result links.

        Links are returned in page order with duplicates removed. For
        offset 0 the total-result count is also stored in TOPLAMSONUC.
        Returns None when the page cannot be downloaded or parsed.
        """
        try:
            body = self.getUrl(sSayfa)
            # getUrl has already reported the failure; do not feed None to
            # the regex and print a second, misleading error.
            if body is None:
                return None
            result = []
            seen = set()
            for link in RE_URL.findall(body):
                if link not in seen:
                    seen.add(link)
                    result.append(link)
            if sSayfa == 0:
                self.TOPLAMSONUC = self.getParseToplamSonuc(body)
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            result = None
        return result

    #--------------------------------------------------------------------------
    # Execute: collect the search results
    def execute(self):
        """Collect result links from every page up to offset SAYFA.

        Requests offsets 0, 10, 20, ... while the offset is <= SAYFA and
        concatenates the per-page link lists. Returns an empty list when no
        page limit was configured (SAYFA is None) or SAYFA is negative, and
        None as soon as any page fails.
        """
        # Without a page limit no request is made (this matches the old
        # Python 2 behaviour, where "0 <= None" evaluated to False).
        if self.SAYFA is None:
            return []
        try:
            result = []
            start = 0
            while start <= self.SAYFA:
                links = self.getParse(start)
                # A failed page invalidates the whole run, as before, but
                # without relying on "list += None" raising TypeError.
                if links is None:
                    return None
                result += links
                start += 10
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            result = None
        return result
if __name__ == '__main__':
    # Demo run: search google.com.tr over https for "python" with Turkish
    # interface language and a page limit of 30, then print a numbered list.
    a = Arama('com.tr', 'tr', 'python', True, 30)
    dizi = a.execute()
    # execute() returns None when a page could not be fetched or parsed;
    # treat that as "nothing to print" instead of crashing on iteration.
    for i, d in enumerate(dizi or [], 1):
        print(str(i) + '-)' + d)