[Python] Multithreading application

A.recca a.recca a siciliawebplus.it
Sun 18 Mar 2007 22:59:28 CET


For example, I downloaded this script from the net...

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Crawl a site and extract all unique URLs for html pages.

This script takes one argument: the url to the site to crawl.
If you want to store the output, pipe it to a file.

Usage example (output to console):
python crawlsite.py http://www.mysite.com

Usage example (output to file in Windows):
python crawlsite.py http://www.mysite.com > mylinks.txt

This script was written in haste. Please report errors to
pete a standards-schmandards.com

This script uses the htmldata library by Connelly Barnes. Please
make sure it is available in the same folder.

"""

__author__ = 'Peter Krantz'
__version__ = '0.1'
__date__ = '2005/04/01'

import urllib2
import htmldata
import httplib
import sys
import urlparse
import codecs
import datetime


#Setup some basic parameters
useragentFirefox = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6) Gecko/20050223 Firefox/1.0.1"
useragentIE6 = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"
useragentSelf = "Sitecrawler " + __version__ + " " + __date__ + " by " + __author__

skippedProtocols = ("javascript", "mailto", "ftp", "gopher")
validContentTypes = ("text/html", "application/xhtml+xml")


#get command line parameters
#Starting url
url = sys.argv[1]

#Get root url
urlparts = urlparse.urlsplit(url)
rootUrl = urlparts[0] + "://" + urlparts[1]

#List of parsed urls
parsedurls = []

#Is contenttype parsable?
def isParsable(contentType):
    result = False
    for validContentType in validContentTypes:
        if validContentType in contentType:
            result = True
            break

    return result


def stripFragment(url):
    #Rebuild the url without its fragment part (keeps the "?" before the query)
    protocol, server, path, query, fragment = urlparse.urlsplit(url)
    return urlparse.urlunsplit((protocol, server, path, query, ""))




def addUrlToHistory(url):
    global parsedurls

    #Add url without fragment to list of parsed urls
    parsedurls.append(stripFragment(url))



#Check if URL exists. Returns status and content type.
def urlIsOk(url):
    global rootUrl
    global parsedurls

    try:

        #split the url to get the request item
        urlparts = urlparse.urlsplit(url)

        protocol = urlparts[0]
        server = urlparts[1]
        path = urlparts[2]
        fragment = urlparts[4]

        #Skip links where protocol is one of skippedProtocols
        if protocol in skippedProtocols:
            return (True, "unknown", 0)

        #Skip links to other sites
        if len(server) > 0:
            if url.find(rootUrl) == -1:
                return (False, "unknown", 0)

        #Skip same page links
        if len(fragment) > 0:
            if stripFragment(url) in parsedurls:
                return (False, "unknown", 0)

        #Check url header with a HEAD request
        if path == "":
            path = "/"
        httpObj = httplib.HTTPConnection(server, 80)
        httpObj.connect()
        httpObj.putrequest('HEAD', path)
        httpObj.putheader('Accept', '*/*')
        httpObj.putheader('User-Agent', useragentSelf)
        httpObj.endheaders()
        response = httpObj.getresponse()
        contentType = response.getheader("content-type", "unknown")
        httpObj.close()

        if response.status != 200:
            if response.status in (301, 302):
                #moved permanently or redirect - follow the new location
                return urlIsOk(response.getheader("location"))
            else:
                #server error message
                return (False, contentType, response.status)
        else:
            #Server reports url is OK.
            return (True, contentType, 200)

    except Exception:
        return (False, "unknown", 999)



def checkUrl(url):
    #A url is worth crawling if it exists and its content type is parsable
    status, contentType, code = urlIsOk(url)
    return status and isParsable(contentType)




#get html for a page
def getContent(url):
    try:
        contents = urllib2.urlopen(url).read()
        return contents
    except Exception:
        #Treat any download error as an empty page
        return ""



#Get data
def printlinks(url, currentlevel):
    global parsedurls
    global currentUrl

    #Check if URL already parsed
    if not (stripFragment(url) in parsedurls):

        #check if url is ok
        if checkUrl(url):
            #Get doc
            currentUrl = url
            contents = getContent(url)

            #add title and url to list
            addUrlToHistory(url)

            #print url
            print url

            #recurse                                   
            links = htmldata.urlextract(contents, url)

            for u in links:
                printlinks(u.url, currentlevel)




#start script
printlinks(url, 0)

but it takes quite a while just to collect the links of a site of about 10
pages, and I would like to speed it up. Can you tell me how, please?
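What I have in mind is something like a fixed pool of worker threads pulling
urls from a shared queue, with the number of threads taken from the command
line. This is only a rough sketch, not the script above: fetchPage stands in
for the real per-url work done in printlinks, the seed urls are placeholders,
and a real version would also need a lock around the shared parsedurls list.

import sys
import threading
import Queue
import urllib2

#Number of worker threads, taken from the command line (default 4)
numThreads = 4
if len(sys.argv) > 2:
    numThreads = int(sys.argv[2])

workQueue = Queue.Queue()

def fetchPage():
    #Each worker pulls urls from the queue until the program exits
    while True:
        pageUrl = workQueue.get()
        try:
            contents = urllib2.urlopen(pageUrl).read()
            print pageUrl, len(contents)
        except Exception:
            pass
        workQueue.task_done()

for i in range(numThreads):
    t = threading.Thread(target=fetchPage)
    t.setDaemon(True)
    t.start()

#placeholder urls; the real crawler would enqueue the links it extracts
for pageUrl in ("http://www.mysite.com/", "http://www.mysite.com/page1.html"):
    workQueue.put(pageUrl)

workQueue.join()

Since almost all the time is spent waiting on the network, worker threads (or
the asynchronous approach quoted below) overlap those waits even though only
one thread runs Python code at a time.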

Valentino Volonghi aka Dialtone wrote:
> On Sun, 18 Mar 2007 22:36:05 +0100, "A.recca" 
> <a.recca a siciliawebplus.it> wrote:
>
>> My only problem is that I would like to speed up the program and
>> therefore make it multithreaded, setting the number of threads I want
>> from the command line. Can you give me some advice?
>
> The 'quindi' (therefore) in the middle of that sentence sounds odd to me...
> You don't make a program fast by using multi-threading :). Web clients in
> particular are written asynchronously; have a look at libevent/pyevent or,
> better in my opinion, Twisted Matrix.
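For reference, a minimal sketch of the asynchronous style suggested above,
assuming Twisted is installed and using twisted.web.client.getPage; the urls
are placeholders, and a real crawler would keep feeding newly extracted links
back into the same reactor loop instead of using a fixed list.

from twisted.internet import reactor, defer
from twisted.web.client import getPage

#placeholder urls; in the real crawler these would come from htmldata.urlextract
urls = ["http://www.mysite.com/", "http://www.mysite.com/page1.html"]

def allDone(results):
    #results is a list of (success, page body) tuples
    for success, page in results:
        if success:
            print len(page)
    reactor.stop()

#start all downloads at once; the reactor multiplexes them on a single thread
deferreds = [getPage(u) for u in urls]
defer.DeferredList(deferreds, consumeErrors=True).addCallback(allDone)
reactor.run()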



More information about the Python mailing list