makaleler / Python Programlama / Google arama (google results) sonuclarını alma işlemi

Google arama (Google results) sonuçlarını alma işlemi

21.10.2014 14:44:48

Python programlama dilini kullanarak Google arama (Google results) sonuçlarını alma

Python ile Google arama sonuçlarını ('python google results') alıp işleyebileceğiniz aşağıdaki kodu inceleyebilirsiniz.

#!/usr/bin/python
#-*- coding:utf-8 -*-

###############################################################################
# 18.10.2014
# husonet
# Huseyin OZDEMIR
# Arama motorlarını parse islemi gerceklestirir.
###############################################################################

import random
import pycurl
import StringIO
import urllib
import re

# Complete request-header lines (header name included); one entry is picked
# at random for every request.
USER_AGENTS = [
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "User-Agent: Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "User-Agent: Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)",
]

# URL pattern: captures the text between 'href="/url?q=' and the next '&'.
# Raw string so that '\?' reaches the regex engine without relying on
# Python passing unknown escape sequences through unchanged.
RE_URL = re.compile(r'href="/url\?q=(.*?)&.*?"', re.DOTALL | re.IGNORECASE)

# Total-result-count pattern: captures the first run of digits/dots that
# follows the right-aligned statistics cell and precedes '(<b>'.
RE_URL_TOPLAMSONUC = re.compile(
    r'<td nowrap align="right"><font size="-1">.*?([0-9\.]+).*?\(<b>',
    re.DOTALL | re.IGNORECASE)

# Set to 1 to re-raise exceptions instead of printing them.
DEBUG = 0
class Arama(object):
    """Fetch Google result pages with pycurl and extract the result links.

    Instance state:
        HOST        -- host name, built from the 'www.google.%s' template.
        LANG        -- value sent as the ``hl`` query parameter.
        KEYWORD     -- search term (URL-quoted when the request is built).
        SSL         -- URL scheme prefix, 'https://' or 'http://'.
        SAYFA       -- largest ``start`` offset to request (sSayfa - 10),
                       or None when no result count was given.
        TOPLAMSONUC -- total-result figure parsed from the first page,
                       or None when it has not been / could not be parsed.
    """
    HOST     = 'www.google.%s'
    LANG    = None
    KEYWORD = None
    SSL     = None
    SAYFA   = None
    TOPLAMSONUC = None

    #--------------------------------------------------------------------------
    # Runs when the object is created; stores whichever initial values
    # were supplied.
    def __init__(self, sTld=None, sLang=None, sKeyword=None, sSsl=False,
                 sSayfa=None):
        """Configure a search.

        sTld     -- top-level domain appended to 'www.google.' ('com' when
                    omitted, so the '%s' placeholder never leaks into HOST).
        sLang    -- interface language code for the ``hl`` parameter.
        sKeyword -- search term.
        sSsl     -- truthy to use https, otherwise http.
        sSayfa   -- number of results wanted, in steps of 10 per page.
        """
        self.HOST = self.HOST % (sTld if sTld is not None else 'com',)
        if sLang is not None:
            self.LANG = sLang
        if sKeyword is not None:
            self.KEYWORD = sKeyword
        # Plain scheme prefix; it is concatenated directly in front of HOST.
        if sSsl:
            self.SSL = 'https://'
        else:
            self.SSL = 'http://'
        if sSayfa is not None:
            # Pages are addressed by their 0-based start offset, so the last
            # offset needed for sSayfa results is sSayfa - 10.
            self.SAYFA = sSayfa - 10

    #--------------------------------------------------------------------------
    # Open the URL
    def getUrl(self, sSayfa=0):
        """Download one result page.

        sSayfa -- ``start`` offset of the page; 0 requests the first page and
                  omits the ``start`` parameter.

        Returns the raw response (headers included, because HEADER is
        enabled) or None on failure.  With DEBUG set the exception is
        re-raised instead of being printed.
        """
        result = None
        ch = None
        try:
            PAGE = '/search?hl=%s&q=%s&ie=utf-8' % (
                self.LANG, urllib.quote(self.KEYWORD))
            if sSayfa != 0:
                PAGE += '&start=%d' % sSayfa
            headers = [
                random.choice(USER_AGENTS),
                "Content-Type: text/plain",
            ]
            SIO = StringIO.StringIO()
            URL = self.SSL + self.HOST + PAGE
            ch = pycurl.Curl()
            ch.setopt(ch.URL, URL)
            ch.setopt(ch.HEADER, True)
            ch.setopt(ch.FOLLOWLOCATION, 1)
            # NOTE(review): certificate and host verification are switched
            # off, which leaves https requests open to interception.
            # Confirm whether this is still required before relying on it.
            ch.setopt(ch.SSL_VERIFYPEER, False)
            ch.setopt(ch.SSL_VERIFYHOST, False)
            ch.setopt(ch.HTTPHEADER, headers)
            ch.setopt(ch.WRITEFUNCTION, SIO.write)
            ch.perform()
            result = SIO.getvalue()
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            result = None
        finally:
            # Release the curl handle even when perform() fails.
            if ch is not None:
                ch.close()
        return result

    #--------------------------------------------------------------------------
    # Parse the total result count
    def getParseToplamSonuc(self, sBody):
        """Return the total-result figure found in sBody as a string.

        Returns None when sBody is None, when the pattern does not match, or
        when matching fails (the error is printed, or re-raised with DEBUG).
        """
        if sBody is None:
            return None
        try:
            match = RE_URL_TOPLAMSONUC.search(sBody)
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            return None
        if match is None:
            return None
        return match.group(1)

    #--------------------------------------------------------------------------
    # Parse the result links of one page
    def getParse(self, sSayfa):
        """Fetch the page at offset sSayfa and return its unique result links.

        Links keep their order of first appearance.  For the first page
        (sSayfa == 0) TOPLAMSONUC is refreshed as well.  Returns None when
        the page could not be fetched or parsed.
        """
        body = self.getUrl(sSayfa)
        if body is None:
            return None
        try:
            result = []
            for link in RE_URL.findall(body):
                if link not in result:
                    result.append(link)
            if sSayfa == 0:
                self.TOPLAMSONUC = self.getParseToplamSonuc(body)
        except Exception as err:
            if DEBUG:
                raise
            print(str(err))
            result = None
        return result

    #--------------------------------------------------------------------------
    # Execute: collect the search results
    def execute(self):
        """Collect the links of every page from offset 0 up to SAYFA.

        Returns the concatenated list of links, an empty list when no result
        count was configured (SAYFA is None), or None as soon as one page
        fails.
        """
        if self.SAYFA is None:
            return []
        result = []
        i = 0
        while i <= self.SAYFA:
            links = self.getParse(i)
            if links is None:
                return None
            result += links
            i += 10
        return result

if __name__ == '__main__':
    # Example run: search google.com.tr in Turkish for "python" over https,
    # asking for 30 results.
    a = Arama('com.tr', 'tr', 'python', True, 30)
    dizi = a.execute()
    # execute() returns None when a page fails, so fall back to an empty
    # list instead of crashing while iterating.
    for i, d in enumerate(dizi or [], 1):
        print(str(i) + '-)' + d)
yazar husonet

Yorumlar

Bu içerik için siz de yorum yapabilirsiniz!
anasayfa | makaleler | haberler | dosyalar | linkler | hakkımızda