[Python] Multithreading application
A.recca
a.recca a siciliawebplus.it
Sun 18 Mar 2007 22:59:28 CET
For example, I downloaded this script from the net....
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Crawl a site and extract all unique URLs for html pages.
This script takes one argument: the url to the site to crawl.
If you want to store the output, pipe it to a file.
Usage example (output to console):
python crawlsite.py http://www.mysite.com
Usage example (output to file in Windows):
python crawlsite.py http://www.mysite.com > mylinks.txt
This script was written in haste. Please report errors to
pete a standards-schmandards.com
This script uses the htmldata library by Connelly Barnes. Please
make sure it is available in the same folder.
"""
__author__ = 'Peter Krantz'
__version__ = '0.1'
__date__ = '2005/04/01'
import urllib2
import htmldata
import httplib
import sys
import urlparse
import codecs
import datetime
#Setup some basic parameters
useragentFirefox = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6) Gecko/20050223 Firefox/1.0.1"
useragentIE6 = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"
useragentSelf = "Sitecrawler " + __version__ + " " + __date__ + " by " + __author__
skippedProtocols = ("javascript", "mailto", "ftp", "gopher")
validContentTypes = ("text/html", "application/xhtml+xml")
#get command line parameters
#Starting url
url = sys.argv[1]
#Get root url
urlparts = urlparse.urlsplit(url)
rootUrl = urlparts[0] + "://" + urlparts[1]
#List of parsed urls
parsedurls = []
#Is contenttype parsable?
def isParsable(contentType):
    result = False
    for validContentType in validContentTypes:
        if validContentType in contentType:
            result = True
            break
    return result
def stripFragment(url):
    #Rebuild the url without the #fragment part. urlunsplit re-adds the
    #"?" before the query string when one is present.
    urlparts = urlparse.urlsplit(url)
    protocol = urlparts[0]
    server = urlparts[1]
    path = urlparts[2]
    query = urlparts[3]
    return urlparse.urlunsplit((protocol, server, path, query, ""))
def addUrlToHistory(url):
    global parsedurls
    #Add url without fragment to list of parsed urls
    parsedurls.append(stripFragment(url))
#Check if URL exists. Returns status and content type.
def urlIsOk(url):
    global rootUrl
    global parsedurls
    try:
        #split the url to get the request item
        urlparts = urlparse.urlsplit(url)
        protocol = urlparts[0]
        server = urlparts[1]
        path = urlparts[2]
        fragment = urlparts[4]
        #Skip links where protocol is one of skippedProtocols
        if protocol in skippedProtocols:
            return (True, "unknown", 0)
        #Skip links to other sites
        if len(server) > 0:
            if url.find(rootUrl) == -1:
                return (False, "unknown", 0)
        #Skip same page links
        if len(fragment) > 0:
            if stripFragment(url) in parsedurls:
                return (False, "unknown", 0)
        #Check url header with a HEAD request so the body is not downloaded
        httpObj = httplib.HTTPConnection(server, 80)
        httpObj.connect()
        httpObj.putrequest('HEAD', path)
        httpObj.putheader('Accept', '*/*')
        httpObj.putheader('User-Agent', useragentSelf)
        httpObj.endheaders()
        response = httpObj.getresponse()
        contentType = response.getheader("content-type", "unknown")
        httpObj.close()
        if response.status != 200:
            if response.status == 301:
                #moved permanently - read location
                return urlIsOk(response.getheader("location"))
            if response.status == 302:
                #handle redirect
                return urlIsOk(response.getheader("location"))
            else:
                #server error message
                return (False, contentType, response.status)
        else:
            #Server reports url is OK.
            return (True, contentType, 200)
    except Exception:
        return (False, "unknown", 999)
def checkUrl(url):
    global currentUrl
    result = urlIsOk(url)
    if result[0]:
        #determine if link is crawlable
        if isParsable(result[1]):
            return True
        else:
            return False
    else:
        return False
#get html for a page
def getContent(url):
    try:
        contents = urllib2.urlopen(url).read()
        return contents
    except:
        return ""
#Get data
def printlinks(url, currentlevel):
    global recurselimit
    global pagetitle
    global parsedurls
    global currentUrl
    #Check if URL already parsed
    if not (stripFragment(url) in parsedurls):
        #check if url is ok
        if checkUrl(url):
            #Get doc
            currentUrl = url
            contents = getContent(url)
            #add title and url to list
            addUrlToHistory(url)
            #print url
            print url
            #recurse
            links = htmldata.urlextract(contents, url)
            for u in links:
                printlinks(u.url, currentlevel)
#start script
printlinks(url, 0)
But grabbing the links from a site of about 10 pages takes quite a while,
and I would like to speed it up. Can you tell me how, please?
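Something like the following worker-pool sketch is what I have in mind, in
the same Python 2 style as the script above (the names fetch_worker, tasks
and results are mine, and this only parallelizes the downloads: the crawl
logic of the script would still need to feed newly discovered links back
into the queue and protect parsedurls with a lock):

#!/usr/bin/python
# -*- coding: utf-8 -*-
#Hypothetical sketch: fetch a list of URLs with N worker threads.
#The thread count is taken from the command line, as in my question quoted below.
import sys
import urllib2
import threading
import Queue

def fetch_worker(tasks, results):
    #Each worker pulls URLs from the shared queue until it is empty.
    while True:
        try:
            url = tasks.get_nowait()
        except Queue.Empty:
            return
        try:
            results.put((url, len(urllib2.urlopen(url).read())))
        except Exception:
            results.put((url, None))

if __name__ == "__main__":
    numthreads = int(sys.argv[1])
    urls = sys.argv[2:]
    tasks = Queue.Queue()
    results = Queue.Queue()
    for u in urls:
        tasks.put(u)
    workers = [threading.Thread(target=fetch_worker, args=(tasks, results))
               for i in range(numthreads)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    #Print what each thread fetched (url and size in bytes, None on error).
    while not results.empty():
        url, size = results.get()
        print url, size

Run for example as: python fetchpool.py 4 http://www.mysite.com/a.html
http://www.mysite.com/b.html (fetchpool.py is just a name I made up).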
Valentino Volonghi aka Dialtone wrote:
> On Sun, 18 Mar 2007 22:36:05 +0100, "A.recca"
> <a.recca a siciliawebplus.it> wrote:
>
>> My only problem is that I would like to speed up the program and
>> therefore make it multithreaded, setting the number of threads I want
>> from the command line. Can you give me some advice???
>
> The 'therefore' in the middle of that sentence sounds odd to me...
> You make a program fast by not using multi-threading, certainly not by
> using it :). Web clients in particular are written asynchronously;
> have a look at libevent/pyevent or, better in my opinion, Twisted
> Matrix.
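To see what that asynchronous style looks like, here is a minimal sketch
using twisted.web.client.getPage, which Twisted provided at the time (the
URL list and the two callbacks are placeholders of mine, not a full
crawler):

#!/usr/bin/python
# -*- coding: utf-8 -*-
#Hypothetical sketch: download several pages concurrently with Twisted,
#without any threads. Requires the Twisted package from twistedmatrix.com.
from twisted.internet import reactor, defer
from twisted.web.client import getPage

urls = ["http://www.mysite.com/", "http://www.mysite.com/about.html"]

def on_page(body, url):
    #Called when one download completes.
    print url, len(body)

def on_error(failure, url):
    #Called when a download fails.
    print url, "failed:", failure.getErrorMessage()

deferreds = []
for u in urls:
    d = getPage(u)
    d.addCallback(on_page, u)
    d.addErrback(on_error, u)
    deferreds.append(d)

#Stop the reactor once every download has finished, one way or the other.
defer.DeferredList(deferreds).addCallback(lambda ignored: reactor.stop())
reactor.run()

All the requests are in flight at the same time, and the callbacks run in a
single thread as responses arrive, so no locking is needed around shared
state such as parsedurls.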