2018 Scrapy Environment Enhance (3) Docker ENV
Set Up Scrapy on Ubuntu DEV
>sudo apt-get install -qy python python-dev python-distribute python-pip ipython
>sudo apt-get install -qy firefox xvfb
>sudo apt-get install -qy libffi-dev libxml2-dev libxslt-dev lib32z1-dev libssl-dev
>sudo apt-get install python3-venv
>sudo apt-get install python3-dev
>sudo apt install unzip
>sudo apt-get install libxi6 libgconf-2-4
>sudo apt-get install libnss3 libgconf-2-4
>sudo apt-get install chromium-browser
If needed, make git remember the username and password:
>git config credential.helper 'cache --timeout=300000'
Create the virtual ENV and activate it:
>python3 -m venv ./env
>source ./env/bin/activate
>pip install --upgrade pip
>pip install selenium pyvirtualdisplay
>pip install boto3
>pip install beautifulsoup4 requests
Install Twisted
>wget http://twistedmatrix.com/Releases/Twisted/17.9/Twisted-17.9.0.tar.bz2
>tar xjf Twisted-17.9.0.tar.bz2
>cd Twisted-17.9.0
>python setup.py install
>pip install lxml scrapy scrapyjs
Install Browser and Driver
>wget https://chromedriver.storage.googleapis.com/2.37/chromedriver_linux64.zip
>unzip chromedriver_linux64.zip
>chmod a+x chromedriver
>sudo mv chromedriver /usr/local/bin/
>chromedriver --version
ChromeDriver 2.37.544315 (730aa6a5fdba159ac9f4c1e8cbc59bf1b5ce12b7)
>chromium-browser --version
Chromium 65.0.3325.181 Built on Ubuntu, running on Ubuntu 16.04
Set up the Tor Network Proxy
>sudo apt-get install tor
>sudo apt-get install netcat
>sudo apt-get install curl
>sudo apt-get install privoxy
Check my local IP:
>curl http://icanhazip.com/
52.14.197.xxx
Set Up Tor
>tor --hash-password prxxxxxxxx
16:01D5D02xxxxxxxxxxxxxxxxxxxxxxxxxxx
>cat /etc/tor/torrc
ControlPort 9051
>cat /etc/tor/torrc    # password setting
HashedControlPassword 16:01D5D02EFA3D6A5xxxxxxxxxxxxxxxxxxx
Start Tor
>sudo service tor start
Verify it changes my IP:
>torify curl http://icanhazip.com/
192.36.27.4
This command does not work here:
>echo -e 'AUTHENTICATE "pricemonitor1234"\r\nsignal NEWNYM\r\nQUIT' | nc 127.0.0.1 9051
Try to use Python to change the IP:
>pip install stem
>python
Python 3.5.2 (default, Nov 23 2017, 16:37:01)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>
>>> from stem import Signal
>>> from stem.control import Controller
>>> with Controller.from_port(port=9051) as controller:
...     controller.authenticate()
...     controller.signal(Signal.NEWNYM)
...
That should work if the permission is right.
Config the Proxy
>cat /etc/privoxy/config
forward-socks5t / 127.0.0.1:9050 .
Start the Service:
>sudo service privoxy start
Verify the IP:
>curl -x 127.0.0.1:8118 http://icanhazip.com/
185.220.101.6
Verify with the Requests API:
>python
Python 3.5.2 (default, Nov 23 2017, 16:37:01)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>
>>>
>>> import requests
>>> response = requests.get('http://icanhazip.com/', proxies={'http': '127.0.0.1:8118'})
>>> response.text.strip()
'185.220.101.6'
Think About the Docker Application
Dockerfile
# Run a scrapy server side.
# Prepare the OS
FROM ubuntu:16.04
MAINTAINER Carl Luo <[email protected]>
# Non-interactive frontend so apt never prompts during the build.
ENV DEBIAN_FRONTEND noninteractive
RUN apt-get -qq update
RUN apt-get -qqy dist-upgrade
# Prepare the dependencies
RUN apt-get install -qy python3 python3-dev python-distribute python3-pip ipython
RUN apt-get install -qy firefox xvfb
RUN pip3 install selenium pyvirtualdisplay
RUN pip3 install boto3 beautifulsoup4 requests
RUN apt-get install -qy libffi-dev libxml2-dev libxslt-dev lib32z1-dev libssl-dev
RUN pip3 install lxml scrapy scrapyjs
RUN pip3 install --upgrade pip
RUN apt-get install -qy python3-venv
RUN apt-get install -qy libxi6 libgconf-2-4 libnss3
RUN apt-get install -qy chromium-browser
RUN apt-get install -qy wget unzip git
# add tool
ADD install/chromedriver /usr/local/bin/
# pip3 (not pip) so scrapyd lands in the same Python 3 env as scrapy above.
RUN pip3 install scrapyd
# copy the config
RUN mkdir -p /tool/scrapyd/
ADD conf/scrapyd.conf /tool/scrapyd/
# set up the app; 6801 matches http_port in conf/scrapyd.conf
EXPOSE 6801
RUN mkdir -p /app/
ADD start.sh /app/
WORKDIR /app/
CMD ["./start.sh"]
Makefile
# Build/run helpers for the scrapyd Docker image.
IMAGE = sillycat/public
TAG = ubuntu-scrapy-1.0
NAME = ubuntu-scrapy-1.0

# These targets do not produce files named after themselves.
.PHONY: docker-context build run debug clean logs publish

docker-context:

build: docker-context
	docker build -t $(IMAGE):$(TAG) .

run:
	docker run -d -p 6801:6801 --name $(NAME) $(IMAGE):$(TAG)

debug:
	docker run -p 6801:6801 --name $(NAME) -ti $(IMAGE):$(TAG) /bin/bash

clean:
	docker stop $(NAME)
	docker rm $(NAME)

logs:
	docker logs $(NAME)

publish:
	docker push $(IMAGE)
start.sh
#!/bin/sh
# Start scrapyd from its configuration directory (/tool/scrapyd/ holds
# scrapyd.conf, copied in by the Dockerfile).
# Options set here, not on the shebang line, so they survive `sh start.sh`.
set -ex
cd /tool/scrapyd/
# exec replaces the shell so scrapyd runs as PID 1 and receives signals
# (clean `docker stop`).
exec scrapyd
Configuration in conf/scrapyd.conf
# Scrapyd daemon configuration; served inside the container on port 6801
# (see EXPOSE in the Dockerfile).
[scrapyd]
eggs_dir=eggs
logs_dir=logs
# Empty items_dir: items are not stored on disk (feed exports handle them).
items_dir=
jobs_to_keep=100
dbs_dir=dbs
# max_proc=0: no absolute cap; limit is max_proc_per_cpu * CPU count.
max_proc=0
max_proc_per_cpu=20
finished_to_keep=100
# Seconds between polls of the job queue.
poll_interval=5.0
# Bind all interfaces so the port is reachable from outside the container.
bind_address=0.0.0.0
http_port=6801
debug=off
runner=scrapyd.runner
application=scrapyd.app.application
launcher=scrapyd.launcher.Launcher
webroot=scrapyd.website.Root

# JSON web-service endpoints mapped to their scrapyd handlers.
[services]
schedule.json=scrapyd.webservice.Schedule
cancel.json=scrapyd.webservice.Cancel
addversion.json=scrapyd.webservice.AddVersion
listprojects.json=scrapyd.webservice.ListProjects
listversions.json=scrapyd.webservice.ListVersions
listspiders.json=scrapyd.webservice.ListSpiders
delproject.json=scrapyd.webservice.DeleteProject
delversion.json=scrapyd.webservice.DeleteVersion
listjobs.json=scrapyd.webservice.ListJobs
daemonstatus.json=scrapyd.webservice.DaemonStatus
References: