# Source: pythontr.com
#!/usr/bin/python
# -*- coding: utf-8 -*-
###############################################################################
# 18.10.2014
# husonet
# Huseyin OZDEMIR
# Fetches search engine result pages and parses the links out of them.
###############################################################################
import io
import random
import re

try:  # Python 2
    from urllib import quote as _quote
except ImportError:  # Python 3
    from urllib.parse import quote as _quote

try:
    import pycurl
except ImportError:
    # Keep the pure parsing helpers importable without pycurl; getUrl()
    # reports the missing module when a fetch is actually attempted.
    pycurl = None

# Complete "User-Agent" header lines; one is picked at random per request.
USER_AGENTS = [
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "User-Agent: Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "User-Agent: Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)",
]

# Pattern for result links: captures the target of href="/url?q=<target>&...".
RE_URL = re.compile(r'href="/url\?q=(.*?)&.*?"', re.DOTALL | re.IGNORECASE)

# Pattern for the total-result counter shown on the first result page.
RE_URL_TOPLAMSONUC = re.compile(
    r'<td nowrap align="right"><font size="-1">.*?([0-9\.]+).*?\(<b>',
    re.DOTALL | re.IGNORECASE,
)

# Set to 1 to re-raise errors instead of printing them.
DEBUG = 0


class Arama(object):
    """Run a Google search and collect the result links page by page.

    Errors are printed (or re-raised when the module-level DEBUG flag is
    set) rather than propagated to the caller.
    """

    HOST = 'www.google.%s'  # host template; the TLD is filled in by __init__
    LANG = None             # value sent as the "hl" query parameter
    KEYWORD = None          # search term
    SSL = None              # URL scheme prefix ('https://' or 'http://')
    SAYFA = None            # "start" offset of the last page to fetch
    TOPLAMSONUC = None      # total-result text parsed from the first page

    # -------------------------------------------------------------------------
    # Runs when the object is created; stores the initial values if given.
    def __init__(self, sTld=None, sLang=None, sKeyword=None, sSsl=False,
                 sSayfa=None):
        """Store the search settings.

        sTld     -- top-level domain appended to "www.google." (e.g. 'com.tr')
        sLang    -- interface language sent as "hl"
        sKeyword -- search term
        sSsl     -- truthy to use https, otherwise http
        sSayfa   -- number of results wanted; pages are fetched in steps of 10
        """
        if sTld is not None:
            self.HOST = self.HOST % (sTld)
        if sLang is not None:
            self.LANG = sLang
        if sKeyword is not None:
            self.KEYWORD = sKeyword
        # The scheme is a plain URL prefix; the original literals carried
        # "[url]...[/url]" forum markup, which produced invalid URLs.
        self.SSL = 'https://' if sSsl else 'http://'
        if sSayfa is not None:
            # Offsets start at 0, so the last page starts 10 results earlier.
            self.SAYFA = sSayfa - 10

    # -------------------------------------------------------------------------
    # Open the URL.
    def getUrl(self, sSayfa=0):
        """Fetch one result page and return it as text.

        sSayfa is the "start" offset (0 for the first page).  The returned
        text includes the HTTP response headers because the HEADER option is
        enabled.  Returns None when the request fails.
        """
        result = ''
        try:
            if pycurl is None:
                raise ImportError('pycurl is required to fetch search results')
            page = '/search?hl=%s&q=%s&ie=utf-8' % (self.LANG,
                                                    _quote(self.KEYWORD))
            if sSayfa != 0:
                page += '&start=%d' % sSayfa
            headers = [
                random.choice(USER_AGENTS),
                'Content-Type: text/plain',
            ]
            buf = io.BytesIO()
            ch = pycurl.Curl()
            try:
                ch.setopt(ch.URL, self.SSL + self.HOST + page)
                ch.setopt(ch.HEADER, True)
                ch.setopt(ch.FOLLOWLOCATION, 1)
                # NOTE(review): certificate verification is disabled, as in
                # the original; enable it if a CA bundle is available.
                ch.setopt(ch.SSL_VERIFYPEER, False)
                ch.setopt(ch.SSL_VERIFYHOST, False)
                ch.setopt(ch.HTTPHEADER, headers)
                ch.setopt(ch.WRITEFUNCTION, buf.write)
                ch.perform()
            finally:
                # Release the handle even when setopt/perform raises.
                ch.close()
            result = buf.getvalue()
            if not isinstance(result, str):
                # Python 3: pycurl delivers bytes, the regexes expect text.
                result = result.decode('utf-8', 'replace')
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            result = None
        return result

    # -------------------------------------------------------------------------
    # Parse the total-result counter (TOPLAMSONUC).
    def getParseToplamSonuc(self, sBody):
        """Return the total-result text found in sBody, or None if absent."""
        if not sBody:
            return None
        try:
            match = RE_URL_TOPLAMSONUC.search(sBody)
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            return None
        return match.group(1) if match else None

    # -------------------------------------------------------------------------
    # Parse the result links of one page.
    def getParse(self, sSayfa):
        """Fetch the page at offset sSayfa and return its unique links.

        Links keep their order of first appearance.  On the first page
        (offset 0) TOPLAMSONUC is refreshed as well.  Returns None when the
        page could not be fetched or parsed.
        """
        try:
            body = self.getUrl(sSayfa)
            if body is None:
                # The fetch failed and getUrl has already reported why.
                return None
            result = []
            seen = set()
            for link in RE_URL.findall(body):
                if link not in seen:
                    seen.add(link)
                    result.append(link)
            if sSayfa == 0:
                self.TOPLAMSONUC = self.getParseToplamSonuc(body)
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            result = None
        return result

    # -------------------------------------------------------------------------
    # Execute: collect the search results.
    def execute(self):
        """Fetch every page up to offset SAYFA and return all links found.

        Always returns a list.  A page that fails is skipped so that the
        links gathered from the other pages are kept.  When no page count
        was configured (SAYFA is None) nothing is fetched.
        """
        result = []
        if self.SAYFA is None:
            return result
        try:
            start = 0
            while start <= self.SAYFA:
                links = self.getParse(start)
                if links:
                    result.extend(links)
                start += 10
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
        return result


if __name__ == '__main__':
    a = Arama('com.tr', 'tr', 'python', True, 30)
    for i, d in enumerate(a.execute(), 1):
        print(str(i) + '-)' + d)
# Yorumlar ("Comments" heading left over from the source web page)