
Lianjia Scraper in Python

Posted by 阿布云

 


 

I have recently been planning to buy a home in Beijing, just as prices began to climb sharply. The price analysis Lianjia publishes covers only a small slice of the data, far short of what I needed, so I spent a few hours one evening writing a scraper that pulled down information on every residential community (xiaoqu) in Beijing along with all of the historical transaction records. The code is posted below for anyone who needs it.

Python



# -*- coding: utf-8 -*-

"""

@author: 冰蓝

@site: http://lanbing510.info

"""

 

import re

import urllib2  

import sqlite3

import random

import threading

from bs4 import BeautifulSoup

 

import sys

reload(sys)

sys.setdefaultencoding("utf-8")

 

# Log in first: without logging in, transactions from the most recent three months cannot be scraped

import LianJiaLogIn
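# LianJiaLogIn is the author's separate login module; it is not included in this post.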

 

 

#Some User Agents

hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\

    {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\

    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},\

    {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},\

    {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},\

    {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\

    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\

    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},\

    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\

    {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\

    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},\

    {'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},\

    {'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]

 

 

# List of Beijing districts

regions=[u"东城",u"西城",u"朝阳",u"海淀",u"丰台",u"石景山",u"通州",u"昌平",u"大兴",u"亦庄开发区",u"顺义",u"房山",u"门头沟",u"平谷",u"怀柔",u"密云",u"延庆",u"燕郊"]

 

 

lock = threading.Lock()

 

 

class SQLiteWraper(object):

    """

    A thin wrapper around SQLite that serializes writes from multiple threads.

    """

    def __init__(self,path,command='',*args,**kwargs):  

        self.lock = threading.RLock()  # re-entrant lock guarding database access

        self.path = path  # path to the SQLite database file

 

        if command!='':

            conn=self.get_conn()

            cu=conn.cursor()

            cu.execute(command)

 

    def get_conn(self):  

        conn = sqlite3.connect(self.path)#,check_same_thread=False)  

        conn.text_factory=str

        return conn  

 

    def conn_close(self,conn=None):  

        conn.close()  

 

    def conn_trans(func):  
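        # Decorator: acquire the instance lock, open a fresh connection, inject it
        # as the 'conn' keyword argument, and close the connection once the call returns.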

        def connection(self,*args,**kwargs):  

            self.lock.acquire()  

            conn = self.get_conn()  

            kwargs['conn'] = conn  

            rs = func(self,*args,**kwargs)  

            self.conn_close(conn)  

            self.lock.release()  

            return rs  

        return connection  

 

    @conn_trans    

    def execute(self,command,method_flag=0,conn=None):  

        cu = conn.cursor()

        try:

            if not method_flag:

                cu.execute(command)

            else:

                cu.execute(command[0],command[1])

            conn.commit()

        except sqlite3.IntegrityError,e:

            #print e

            return -1

        except Exception, e:

            print e

            return -2

        return 0

 

    @conn_trans

    def fetchall(self,command="select name from xiaoqu",conn=None):

        cu=conn.cursor()

        lists=[]

        try:

            cu.execute(command)

            lists=cu.fetchall()

        except Exception,e:

            print e

            pass

        return lists

 

 

def gen_xiaoqu_insert_command(info_dict):

    """

    Build the parameterized INSERT command for the xiaoqu table.

    """

    info_list=[u'小区名称',u'大区域',u'小区域',u'小区户型',u'建造时间']

    t=[]

    for il in info_list:

        if il in info_dict:

            t.append(info_dict[il])

        else:

            t.append('')

    t=tuple(t)

    command=(r"insert into xiaoqu values(?,?,?,?,?)",t)

    return command

 

 

def gen_chengjiao_insert_command(info_dict):

    """

    Build the parameterized INSERT command for the chengjiao (transaction) table.

    """

    info_list=[u'链接',u'小区名称',u'户型',u'面积',u'朝向',u'楼层',u'建造时间',u'签约时间',u'签约单价',u'签约总价',u'房产类型',u'学区',u'地铁']

    t=[]

    for il in info_list:

        if il in info_dict:

            t.append(info_dict[il])

        else:

            t.append('')

    t=tuple(t)

    command=(r"insert into chengjiao values(?,?,?,?,?,?,?,?,?,?,?,?,?)",t)

    return command

 

 

def xiaoqu_spider(db_xq,url_page=u"http://bj.lianjia.com/xiaoqu/pg1rs%E6%98%8C%E5%B9%B3/"):

    """

    Scrape the community (xiaoqu) listings from a single result page.

    """

    try:

        req = urllib2.Request(url_page,headers=hds[random.randint(0,len(hds)-1)])

        source_code = urllib2.urlopen(req,timeout=10).read()

        plain_text=unicode(source_code)#,errors='ignore')  

        soup = BeautifulSoup(plain_text)

    except (urllib2.HTTPError, urllib2.URLError), e:

        print e

        exit(-1)

    except Exception,e:

        print e

        exit(-1)

 

    xiaoqu_list=soup.findAll('div',{'class':'info-panel'})

    for xq in xiaoqu_list:

        info_dict={}

        info_dict.update({u'小区名称':xq.find('a').text})

        content=unicode(xq.find('div',{'class':'con'}).renderContents().strip())
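        # Parse district, sub-district, layout count and build year out of the raw HTML of the 'con' block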

        info=re.match(r".+>(.+)</a>.+>(.+)</a>.+</span>(.+)<span>.+</span>(.+)",content)

        if info:

            info=info.groups()

            info_dict.update({u'大区域':info[0]})

            info_dict.update({u'小区域':info[1]})

            info_dict.update({u'小区户型':info[2]})

            info_dict.update({u'建造时间':info[3][:4]})

        command=gen_xiaoqu_insert_command(info_dict)

        db_xq.execute(command,1)

 

 

def do_xiaoqu_spider(db_xq,region=u"昌平"):

    """

    Scrape all communities in one district, crawling each result page in its own thread.

    """

    url=u"http://bj.lianjia.com/xiaoqu/rs"+region+"/"

    try:

        req = urllib2.Request(url,headers=hds[random.randint(0,len(hds)-1)])

        source_code = urllib2.urlopen(req,timeout=5).read()

        plain_text=unicode(source_code)#,errors='ignore')  

        soup = BeautifulSoup(plain_text)

    except (urllib2.HTTPError, urllib2.URLError), e:

        print e

        return

    except Exception,e:

        print e

        return
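    # The pager div stores a dict literal in its 'page-data' attribute (it contains
    # a 'totalPage' field); exec() evaluates it and rebinds d to that dict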

    d="d="+soup.find('div',{'class':'page-box house-lst-page-box'}).get('page-data')

    exec(d)

    total_pages=d['totalPage']

 

    threads=[]

    for i in range(total_pages):

        url_page=u"http://bj.lianjia.com/xiaoqu/pg%drs%s/" % (i+1,region)

        t=threading.Thread(target=xiaoqu_spider,args=(db_xq,url_page))

        threads.append(t)

    for t in threads:

        t.start()

    for t in threads:

        t.join()

    print u"爬下了 %s 区全部的小区信息" % region

 

 

def chengjiao_spider(db_cj,url_page=u"http://bj.lianjia.com/chengjiao/pg1rs%E5%86%A0%E5%BA%AD%E5%9B%AD"):

    """

    Scrape the transaction records from a single result page.

    """

    try:

        req = urllib2.Request(url_page,headers=hds[random.randint(0,len(hds)-1)])

        source_code = urllib2.urlopen(req,timeout=10).read()

        plain_text=unicode(source_code)#,errors='ignore')  

        soup = BeautifulSoup(plain_text)

    except (urllib2.HTTPError, urllib2.URLError), e:

        print e

        exception_write('chengjiao_spider',url_page)

        return

    except Exception,e:

        print e

        exception_write('chengjiao_spider',url_page)

        return

 

    cj_list=soup.findAll('div',{'class':'info-panel'})

    for cj in cj_list:

        info_dict={}

        href=cj.find('a')

        if not href:

            continue

        info_dict.update({u'链接':href.attrs['href']})

        content=cj.find('h2').text.split()

        if content:

            info_dict.update({u'小区名称':content[0]})

            info_dict.update({u'户型':content[1]})

            info_dict.update({u'面积':content[2]})

        content=unicode(cj.find('div',{'class':'con'}).renderContents().strip())

        content=content.split('/')

        if content:

            info_dict.update({u'朝向':content[0].strip()})

            info_dict.update({u'楼层':content[1].strip()})

            if len(content)>=3:

                content[2]=content[2].strip();

                info_dict.update({u'建造时间':content[2][:4]})

        content=cj.findAll('div',{'class':'div-cun'})

        if content:

            info_dict.update({u'签约时间':content[0].text})

            info_dict.update({u'签约单价':content[1].text})

            info_dict.update({u'签约总价':content[2].text})

        content=cj.find('div',{'class':'introduce'}).text.strip().split()
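        # Keyword tags in the 'introduce' block: items containing 满 describe how long the property
        # has been held, 学 marks a school-district listing, 距 gives the distance to a subway station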

        if content:

            for c in content:

                if c.find(u'满')!=-1:

                    info_dict.update({u'房产类型':c})

                elif c.find(u'学')!=-1:

                    info_dict.update({u'学区':c})

                elif c.find(u'距')!=-1:

                    info_dict.update({u'地铁':c})

 

        command=gen_chengjiao_insert_command(info_dict)

        db_cj.execute(command,1)

 

 

def xiaoqu_chengjiao_spider(db_cj,xq_name=u"冠庭园"):

    """

    Scrape all transaction records of a single community.

    """

    url=u"http://bj.lianjia.com/chengjiao/rs"+urllib2.quote(xq_name)+"/"

    try:

        req = urllib2.Request(url,headers=hds[random.randint(0,len(hds)-1)])

        source_code = urllib2.urlopen(req,timeout=10).read()

        plain_text=unicode(source_code)#,errors='ignore')  

        soup = BeautifulSoup(plain_text)

    except (urllib2.HTTPError, urllib2.URLError), e:

        print e

        exception_write('xiaoqu_chengjiao_spider',xq_name)

        return

    except Exception,e:

        print e

        exception_write('xiaoqu_chengjiao_spider',xq_name)

        return

    content=soup.find('div',{'class':'page-box house-lst-page-box'})

    total_pages=0

    if content:

        d="d="+content.get('page-data')

        exec(d)

        total_pages=d['totalPage']

 

    threads=[]

    for i in range(total_pages):

        url_page=u"http://bj.lianjia.com/chengjiao/pg%drs%s/" % (i+1,urllib2.quote(xq_name))

        t=threading.Thread(target=chengjiao_spider,args=(db_cj,url_page))

        threads.append(t)

    for t in threads:

        t.start()

    for t in threads:

        t.join()

 

 

def do_xiaoqu_chengjiao_spider(db_xq,db_cj):

    """

    Scrape the transaction records of every community stored in the xiaoqu database.

    """

    count=0

    xq_list=db_xq.fetchall()

    for xq in xq_list:

        xiaoqu_chengjiao_spider(db_cj,xq[0])

        count+=1

        print 'have spidered %d xiaoqu' % count

    print 'done'

 

 

def exception_write(fun_name,url):

    """

    Append a failed request to the log file.

    """

    lock.acquire()

    f = open('log.txt','a')

    line="%s %s\n" % (fun_name,url)

    f.write(line)

    f.close()

    lock.release()

 

 

def exception_read():

    """

    Read the failed requests from the log file, then truncate it.

    """

    lock.acquire()

    f=open('log.txt','r')

    lines=f.readlines()

    f.close()

    f=open('log.txt','w')

    f.truncate()

    f.close()

    lock.release()

    return lines

 

 

def exception_spider(db_cj):

    """

    Re-crawl the requests that previously failed.

    """

    count=0

    excep_list=exception_read()

    while excep_list:

        for excep in excep_list:

            excep=excep.strip()

            if excep=="":

                continue

            excep_name,url=excep.split(" ",1)

            if excep_name=="chengjiao_spider":

                chengjiao_spider(db_cj,url)

                count+=1

            elif excep_name=="xiaoqu_chengjiao_spider":

                xiaoqu_chengjiao_spider(db_cj,url)

                count+=1

            else:

                print "wrong format"

            print "have spidered %d exception url" % count

        excep_list=exception_read()

    print 'all done ^_^'

 

 

 

if __name__=="__main__":

    command="create table if not exists xiaoqu (name TEXT primary key UNIQUE, regionb TEXT, regions TEXT, style TEXT, year TEXT)"

    db_xq=SQLiteWraper('lianjia-xq.db',command)

 

    command="create table if not exists chengjiao (href TEXT primary key UNIQUE, name TEXT, style TEXT, area TEXT, orientation TEXT, floor TEXT, year TEXT, sign_time TEXT, unit_price TEXT, total_price TEXT,fangchan_class TEXT, school TEXT, subway TEXT)"

    db_cj=SQLiteWraper('lianjia-cj.db',command)

 

    # Scrape the information of every community

    for region in regions:

        do_xiaoqu_spider(db_xq,region)

 

    # Scrape the transaction records inside every community

    do_xiaoqu_chengjiao_spider(db_xq,db_cj)

 

    # Re-crawl the links that failed

    exception_spider(db_cj)
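
Once the script finishes, the two SQLite files can be queried directly for analysis. Below is a minimal sketch (not part of the original script) that lists the ten communities with the most recorded transactions. It assumes lianjia-cj.db has been populated by the run above; note that every column is stored as raw TEXT exactly as scraped, so the price fields need cleaning before any numeric work.

Python

import sqlite3

conn = sqlite3.connect('lianjia-cj.db')
cu = conn.cursor()
# Ten communities with the most recorded transactions, using the chengjiao
# table created in the __main__ block above.
cu.execute("select name, count(*) from chengjiao group by name order by count(*) desc limit 10")
for name, count in cu.fetchall():
    print name, count
conn.close()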

 


Article reposted from: https://geekspider.org