
9/18

master
Wang0018 5 years ago
parent
commit
d7e720352b
  1. 17
      .idea/jsTestFinder.xml
  2. 264
      .idea/workspace.xml
  3. BIN
      all_demo1.egg
  4. 223
      demo1/Util.py
  5. BIN
      demo1/__pycache__/Util.cpython-37.pyc
  6. BIN
      demo1/__pycache__/custom_settings_conf.cpython-37.pyc
  7. BIN
      demo1/__pycache__/items.cpython-37.pyc
  8. BIN
      demo1/__pycache__/middlewares.cpython-37.pyc
  9. BIN
      demo1/__pycache__/pipelines.cpython-37.pyc
  10. BIN
      demo1/__pycache__/settings.cpython-37.pyc
  11. 11
      demo1/crawl.py
  12. 390
      demo1/custom_settings_conf.py
  13. 2
      demo1/items.py
  14. 964
      demo1/logs/chacewang_2020_9.log
  15. 413
      demo1/logs/fagaiwei_2020_9.log
  16. 208
      demo1/logs/fazhancujinju_2020_9.log
  17. 1549
      demo1/logs/gongyehexinxihuabu_2020_9.log
  18. 560
      demo1/logs/huojuzhongxin_2020_9.log
  19. 321
      demo1/logs/kexujishubu_2020_9.log
  20. 1612
      demo1/logs/qicetong_2020_9.log
  21. 104
      demo1/logs/sxfagaiwei_2020_9.log
  22. 101
      demo1/logs/sxgongxinting_2020_9.log
  23. 106
      demo1/logs/sxkejiting_2020_9.log
  24. 110
      demo1/logs/sxshangwuting_2020_9.log
  25. 1305
      demo1/logs/sxzonggaishifanqu_2020_9.log
  26. 102
      demo1/logs/taiyuangongyehexinxihuaju_2020_9.log
  27. 203
      demo1/logs/taiyuankjj_2020_9.log
  28. 1985
      demo1/logs/taiyuanshangwuju_2020_9.log
  29. 106
      demo1/logs/wenhuahelvyoubu_2020_9.log
  30. 110
      demo1/logs/zhongxiaoqiyezongju_2020_9.log
  31. 142
      demo1/logs/ziranweiyuanhui_2020_9.log
  32. 19
      demo1/main.py
  33. 207
      demo1/pipelines.py
  34. 30
      demo1/settings.py
  35. BIN
      demo1/spiders/__pycache__/chacewangSpider.cpython-37.pyc
  36. BIN
      demo1/spiders/__pycache__/fagaiweiSpider.cpython-37.pyc
  37. BIN
      demo1/spiders/__pycache__/gongyehexinxihuabuSpider.cpython-37.pyc
  38. BIN
      demo1/spiders/__pycache__/huojuzhongxinSpider.cpython-37.pyc
  39. BIN
      demo1/spiders/__pycache__/kexujishubuSpider.cpython-37.pyc
  40. BIN
      demo1/spiders/__pycache__/qicetongSpider.cpython-37.pyc
  41. BIN
      demo1/spiders/__pycache__/shanxifagaiwei.cpython-37.pyc
  42. BIN
      demo1/spiders/__pycache__/shanxigongxintingSpider.cpython-37.pyc
  43. BIN
      demo1/spiders/__pycache__/shanxishengkejitingSpider.cpython-37.pyc
  44. BIN
      demo1/spiders/__pycache__/shanxishengshangwutingSpider.cpython-37.pyc
  45. BIN
      demo1/spiders/__pycache__/shanxixiaoqiyecujinjuSpider.cpython-37.pyc
  46. BIN
      demo1/spiders/__pycache__/shanxizonggaiquSpider.cpython-37.pyc
  47. BIN
      demo1/spiders/__pycache__/taiyuangongyehexinxihuajuSpider.cpython-37.pyc
  48. BIN
      demo1/spiders/__pycache__/taiyuanshangwujuSpider.cpython-37.pyc
  49. BIN
      demo1/spiders/__pycache__/taiyuanshikexujishujuSpider.cpython-37.pyc
  50. BIN
      demo1/spiders/__pycache__/wenhuahelvyoubuSpider.cpython-37.pyc
  51. BIN
      demo1/spiders/__pycache__/zhongxiaoqiyejuSpider.cpython-37.pyc
  52. BIN
      demo1/spiders/__pycache__/ziranweiyuanhuiSpider.cpython-37.pyc
  53. 16
      demo1/spiders/chacewangSpider.py
  54. 83
      demo1/spiders/fagaiweiSpider.py
  55. 112
      demo1/spiders/gongyehexinxihuabuSpider.py
  56. 74
      demo1/spiders/huojuzhongxinSpider.py
  57. 136
      demo1/spiders/kexujishubuSpider.py
  58. 74
      demo1/spiders/qicetongSpider.py
  59. 72
      demo1/spiders/shanxifagaiwei.py
  60. 89
      demo1/spiders/shanxigongxintingSpider.py
  61. 96
      demo1/spiders/shanxishengkejitingSpider.py
  62. 183
      demo1/spiders/shanxishengshangwutingSpider.py
  63. 80
      demo1/spiders/shanxixiaoqiyecujinjuSpider.py
  64. 86
      demo1/spiders/shanxizonggaiquSpider.py
  65. 97
      demo1/spiders/taiyuangongyehexinxihuajuSpider.py
  66. 97
      demo1/spiders/taiyuanshangwujuSpider.py
  67. 92
      demo1/spiders/taiyuanshikexujishujuSpider.py
  68. 86
      demo1/spiders/wenhuahelvyoubuSpider.py
  69. 75
      demo1/spiders/zhongxiaoqiyejuSpider.py
  70. 73
      demo1/spiders/ziranweiyuanhuiSpider.py
  71. BIN
      noall_demo1.egg
  72. 3
      scrapy.cfg

17
.idea/jsTestFinder.xml

@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavascriptTestFinderProjectComponent">
<option name="productionExtensions">
<list>
<option value=".js" />
<option value=".jsx" />
</list>
</option>
<option name="testExtensions">
<list>
<option value=".spec.js" />
<option value=".jstd" />
</list>
</option>
</component>
</project>

264
.idea/workspace.xml

@@ -43,16 +43,39 @@
<component name="PropertiesComponent">
<property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../../pythonwork" />
<property name="nodejs_package_manager_path" value="npm" />
<property name="settings.editor.selected.configurable" value="preferences.lookFeel" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="C:\e\scrapywork\demo1\demo1\spiders" />
<recent name="C:\e\scrapywork\demo1" />
</key>
</component>
<component name="RunManager" selected="Python.main">
<configuration name="chacewangSpider1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="demo1" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/demo1/spiders" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/demo1/spiders/chacewangSpider1.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="main" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="demo1" />
<option name="INTERPRETER_OPTIONS" value="" />
@@ -100,6 +123,7 @@
<recent_temporary>
<list>
<item itemvalue="Python.main" />
<item itemvalue="Python.chacewangSpider1" />
<item itemvalue="Python.test" />
</list>
</recent_temporary>
@@ -138,81 +162,134 @@
<workItem from="1595465883453" duration="19514000" />
<workItem from="1595639285883" duration="10719000" />
<workItem from="1595668032999" duration="2304000" />
<workItem from="1595809685678" duration="10984000" />
<workItem from="1595830914377" duration="3592000" />
<workItem from="1596014256234" duration="3030000" />
<workItem from="1596069123364" duration="10704000" />
<workItem from="1597278648032" duration="24716000" />
<workItem from="1597365130062" duration="21936000" />
<workItem from="1597624689484" duration="27989000" />
<workItem from="1597712471622" duration="24590000" />
<workItem from="1597797107226" duration="26280000" />
<workItem from="1597883267831" duration="26072000" />
<workItem from="1597969936970" duration="24616000" />
<workItem from="1598057627947" duration="11406000" />
<workItem from="1598073066889" duration="14855000" />
<workItem from="1598229456278" duration="24498000" />
<workItem from="1598316326964" duration="21337000" />
<workItem from="1598402072100" duration="23819000" />
<workItem from="1598488750513" duration="18987000" />
<workItem from="1598574468818" duration="239000" />
<workItem from="1598574913910" duration="25326000" />
<workItem from="1598834610244" duration="1171000" />
<workItem from="1598836167637" duration="22281000" />
<workItem from="1598920559303" duration="2276000" />
<workItem from="1598923384898" duration="1834000" />
<workItem from="1599006704009" duration="19350000" />
<workItem from="1599093554833" duration="17209000" />
<workItem from="1599179395410" duration="23181000" />
<workItem from="1599267498360" duration="8675000" />
<workItem from="1599439636816" duration="24167000" />
<workItem from="1599524287250" duration="502000" />
<workItem from="1599527234086" duration="13400000" />
<workItem from="1599611681342" duration="27322000" />
<workItem from="1599699388630" duration="12910000" />
<workItem from="1599783922692" duration="3087000" />
<workItem from="1600044044779" duration="1939000" />
<workItem from="1600129441000" duration="5512000" />
<workItem from="1600216786769" duration="5562000" />
<workItem from="1600303566749" duration="11863000" />
<workItem from="1600389231570" duration="7311000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="1" />
</component>
<component name="UnknownFeatures">
<option featureType="com.intellij.fileTypeFactory" implementationName="*.log" />
</component>
<component name="WindowStateProjectService">
<state x="621" y="337" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1595668232369">
<state x="621" y="337" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1596069848327">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="621" y="337" key="#com.intellij.fileTypes.FileTypeChooser/0.0.2048.1112@0.0.2048.1112" timestamp="1595668232369" />
<state x="621" y="337" key="#com.intellij.fileTypes.FileTypeChooser/0.0.2048.1112@0.0.2048.1112" timestamp="1596069848327" />
<state x="630" y="189" key="#com.intellij.refactoring.safeDelete.UnsafeUsagesDialog" timestamp="1595669950835">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="630" y="189" key="#com.intellij.refactoring.safeDelete.UnsafeUsagesDialog/0.0.2048.1112@0.0.2048.1112" timestamp="1595669950835" />
<state width="1145" height="657" key="DebuggerActiveHint" timestamp="1594001507973">
<state x="667" y="131" key="#xdebugger.evaluate" timestamp="1599639939637">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="667" y="131" key="#xdebugger.evaluate/0.0.2048.1112@0.0.2048.1112" timestamp="1599639939637" />
<state width="1323" height="657" key="DebuggerActiveHint" timestamp="1597970555721">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1323" height="657" key="DebuggerActiveHint/0.0.2048.1112@0.0.2048.1112" timestamp="1597970555721" />
<state x="93" y="93" width="1862" height="926" key="DiffContextDialog" timestamp="1599289107195">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="93" y="93" width="1862" height="926" key="DiffContextDialog/0.0.2048.1112@0.0.2048.1112" timestamp="1599289107195" />
<state x="705" y="200" key="FileChooserDialogImpl" timestamp="1600392310544">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1145" height="657" key="DebuggerActiveHint/0.0.2048.1112@0.0.2048.1112" timestamp="1594001507973" />
<state x="705" y="200" key="FileChooserDialogImpl" timestamp="1595666219299">
<state x="705" y="200" key="FileChooserDialogImpl/0.0.2048.1112@0.0.2048.1112" timestamp="1600392310544" />
<state width="1987" height="273" key="GridCell.Tab.0.bottom" timestamp="1600396548662">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="705" y="200" key="FileChooserDialogImpl/0.0.2048.1112@0.0.2048.1112" timestamp="1595666219299" />
<state width="1987" height="239" key="GridCell.Tab.0.bottom" timestamp="1595667096857">
<state width="1987" height="273" key="GridCell.Tab.0.bottom/0.0.2048.1112@0.0.2048.1112" timestamp="1600396548662" />
<state width="1987" height="273" key="GridCell.Tab.0.center" timestamp="1600396548661">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="239" key="GridCell.Tab.0.bottom/0.0.2048.1112@0.0.2048.1112" timestamp="1595667096857" />
<state width="1987" height="239" key="GridCell.Tab.0.center" timestamp="1595667096857">
<state width="1987" height="273" key="GridCell.Tab.0.center/0.0.2048.1112@0.0.2048.1112" timestamp="1600396548661" />
<state width="1987" height="273" key="GridCell.Tab.0.left" timestamp="1600396548661">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="239" key="GridCell.Tab.0.center/0.0.2048.1112@0.0.2048.1112" timestamp="1595667096857" />
<state width="1987" height="239" key="GridCell.Tab.0.left" timestamp="1595667096857">
<state width="1987" height="273" key="GridCell.Tab.0.left/0.0.2048.1112@0.0.2048.1112" timestamp="1600396548661" />
<state width="1987" height="273" key="GridCell.Tab.0.right" timestamp="1600396548661">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="239" key="GridCell.Tab.0.left/0.0.2048.1112@0.0.2048.1112" timestamp="1595667096857" />
<state width="1987" height="239" key="GridCell.Tab.0.right" timestamp="1595667096857">
<state width="1987" height="273" key="GridCell.Tab.0.right/0.0.2048.1112@0.0.2048.1112" timestamp="1600396548661" />
<state width="1987" height="345" key="GridCell.Tab.1.bottom" timestamp="1600395665879">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="239" key="GridCell.Tab.0.right/0.0.2048.1112@0.0.2048.1112" timestamp="1595667096857" />
<state width="1987" height="274" key="GridCell.Tab.1.bottom" timestamp="1595412145071">
<state width="1987" height="345" key="GridCell.Tab.1.bottom/0.0.2048.1112@0.0.2048.1112" timestamp="1600395665879" />
<state width="1987" height="345" key="GridCell.Tab.1.center" timestamp="1600395665878">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="274" key="GridCell.Tab.1.bottom/0.0.2048.1112@0.0.2048.1112" timestamp="1595412145071" />
<state width="1987" height="274" key="GridCell.Tab.1.center" timestamp="1595412145071">
<state width="1987" height="345" key="GridCell.Tab.1.center/0.0.2048.1112@0.0.2048.1112" timestamp="1600395665878" />
<state width="1987" height="345" key="GridCell.Tab.1.left" timestamp="1600395665878">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="274" key="GridCell.Tab.1.center/0.0.2048.1112@0.0.2048.1112" timestamp="1595412145071" />
<state width="1987" height="274" key="GridCell.Tab.1.left" timestamp="1595412145071">
<state width="1987" height="345" key="GridCell.Tab.1.left/0.0.2048.1112@0.0.2048.1112" timestamp="1600395665878" />
<state width="1987" height="345" key="GridCell.Tab.1.right" timestamp="1600395665879">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="274" key="GridCell.Tab.1.left/0.0.2048.1112@0.0.2048.1112" timestamp="1595412145071" />
<state width="1987" height="274" key="GridCell.Tab.1.right" timestamp="1595412145071">
<state width="1987" height="345" key="GridCell.Tab.1.right/0.0.2048.1112@0.0.2048.1112" timestamp="1600395665879" />
<state x="220" y="55" key="SettingsEditor" timestamp="1598229496462">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1987" height="274" key="GridCell.Tab.1.right/0.0.2048.1112@0.0.2048.1112" timestamp="1595412145071" />
<state x="220" y="55" key="SettingsEditor" timestamp="1595379474202">
<state x="220" y="55" key="SettingsEditor/0.0.2048.1112@0.0.2048.1112" timestamp="1598229496462" />
<state x="610" y="172" width="810" height="786" key="StructurePopup" timestamp="1598232790302">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="220" y="55" key="SettingsEditor/0.0.2048.1112@0.0.2048.1112" timestamp="1595379474202" />
<state width="1971" height="636" key="XDebugger.FullValuePopup" timestamp="1595387305358">
<state x="610" y="172" width="810" height="786" key="StructurePopup/0.0.2048.1112@0.0.2048.1112" timestamp="1598232790302" />
<state width="1214" height="590" key="XDebugger.FullValuePopup" timestamp="1600394863845">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state width="1971" height="636" key="XDebugger.FullValuePopup/0.0.2048.1112@0.0.2048.1112" timestamp="1595387305358" />
<state width="1214" height="590" key="XDebugger.FullValuePopup/0.0.2048.1112@0.0.2048.1112" timestamp="1600394863845" />
<state x="369" y="171" key="com.intellij.xdebugger.impl.breakpoints.ui.BreakpointsDialogFactory$2" timestamp="1594281494706">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="369" y="171" key="com.intellij.xdebugger.impl.breakpoints.ui.BreakpointsDialogFactory$2/0.0.2048.1112@0.0.2048.1112" timestamp="1594281494706" />
<state x="1084" y="11" width="964" height="1101" key="find.popup" timestamp="1594089126864">
<state x="1084" y="11" width="964" height="1101" key="find.popup" timestamp="1600392005965">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="1084" y="11" width="964" height="1101" key="find.popup/0.0.2048.1112@0.0.2048.1112" timestamp="1594089126864" />
<state x="517" y="0" width="1009" height="1108" key="search.everywhere.popup" timestamp="1594779833720">
<state x="1084" y="11" width="964" height="1101" key="find.popup/0.0.2048.1112@0.0.2048.1112" timestamp="1600392005965" />
<state x="517" y="0" width="1009" height="1108" key="search.everywhere.popup" timestamp="1598863733612">
<screen x="0" y="0" width="2048" height="1112" />
</state>
<state x="517" y="0" width="1009" height="1108" key="search.everywhere.popup/0.0.2048.1112@0.0.2048.1112" timestamp="1594779833720" />
<state x="517" y="0" width="1009" height="1108" key="search.everywhere.popup/0.0.2048.1112@0.0.2048.1112" timestamp="1598863733612" />
<state x="780" y="249" key="svn.repositoryBrowser" timestamp="1595325296659">
<screen x="0" y="0" width="2048" height="1112" />
</state>
@@ -223,38 +300,123 @@
<breakpoints>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>262</line>
<option name="timeStamp" value="160" />
<line>247</line>
<option name="timeStamp" value="267" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>273</line>
<option name="timeStamp" value="268" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>209</line>
<option name="timeStamp" value="161" />
<line>278</line>
<option name="timeStamp" value="269" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>268</line>
<option name="timeStamp" value="166" />
<line>206</line>
<option name="timeStamp" value="271" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>221</line>
<option name="timeStamp" value="167" />
<line>227</line>
<option name="timeStamp" value="272" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/chacewangSpider.py</url>
<line>102</line>
<option name="timeStamp" value="173" />
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>381</line>
<option name="timeStamp" value="277" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/ziranweiyuanhuiSpider.py</url>
<line>50</line>
<option name="timeStamp" value="294" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/chacewangSpider.py</url>
<line>111</line>
<option name="timeStamp" value="180" />
<url>file://$PROJECT_DIR$/demo1/spiders/huojuzhongxinSpider.py</url>
<line>52</line>
<option name="timeStamp" value="304" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/chacewangSpider.py</url>
<line>107</line>
<option name="timeStamp" value="181" />
<url>file://$PROJECT_DIR$/demo1/spiders/zhongxiaoqiyejuSpider.py</url>
<line>69</line>
<option name="timeStamp" value="331" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/shanxishengshangwutingSpider.py</url>
<line>86</line>
<option name="timeStamp" value="341" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/shanxishengshangwutingSpider.py</url>
<line>87</line>
<option name="timeStamp" value="342" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/shanxigongxintingSpider.py</url>
<line>64</line>
<option name="timeStamp" value="353" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/shanxizonggaiquSpider.py</url>
<line>64</line>
<option name="timeStamp" value="358" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/shanxifagaiwei.py</url>
<line>67</line>
<option name="timeStamp" value="363" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/shanxifagaiwei.py</url>
<line>64</line>
<option name="timeStamp" value="369" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/taiyuanshikexujishujuSpider.py</url>
<line>73</line>
<option name="timeStamp" value="374" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/taiyuanshikexujishujuSpider.py</url>
<line>64</line>
<option name="timeStamp" value="375" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/taiyuangongyehexinxihuajuSpider.py</url>
<line>77</line>
<option name="timeStamp" value="379" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/taiyuangongyehexinxihuajuSpider.py</url>
<line>75</line>
<option name="timeStamp" value="380" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/taiyuangongyehexinxihuajuSpider.py</url>
<line>40</line>
<option name="timeStamp" value="381" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/taiyuanshangwujuSpider.py</url>
<line>47</line>
<option name="timeStamp" value="382" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/spiders/shanxixiaoqiyecujinjuSpider.py</url>
<line>40</line>
<option name="timeStamp" value="383" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>335</line>
<option name="timeStamp" value="390" />
</line-breakpoint>
<line-breakpoint enabled="true" type="python-line">
<url>file://$PROJECT_DIR$/demo1/pipelines.py</url>
<line>323</line>
<option name="timeStamp" value="391" />
</line-breakpoint>
</breakpoints>
<breakpoints-defaults>
@@ -265,12 +427,14 @@
<configuration name="PythonConfigurationType">
<watch expression="j" />
<watch expression="response" />
<watch expression="response" />
<watch expression="response" language="Python" />
<watch expression="item" />
</configuration>
</watches-manager>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/demo1$chacewangSpider1.coverage" NAME="chacewangSpider1 Coverage Results" MODIFIED="1597805689998" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/demo1/spiders" />
<SUITE FILE_PATH="coverage/demo1$test.coverage" NAME="test Coverage Results" MODIFIED="1594782961016" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/demo1" />
<SUITE FILE_PATH="coverage/demo1$main.coverage" NAME="main Coverage Results" MODIFIED="1595639349291" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/demo1" />
<SUITE FILE_PATH="coverage/demo1$main.coverage" NAME="main Coverage Results" MODIFIED="1600396548656" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/demo1" />
</component>
</project>

BIN
all_demo1.egg

Binary file not shown.

223
demo1/Util.py

@@ -0,0 +1,223 @@
from uuid import uuid4
import datetime
from lxml import etree
import logging
import pymysql
from abc import ABCMeta,abstractmethod
from twisted.enterprise import adbapi
import copy
class Util_WANG(metaclass=ABCMeta):
@classmethod
def pos_url(cls,item,settings,response=None):
"""
Decide whether the link points directly to a downloadable file such as a pdf or image.
:param item: the original item
:param settings: the settings object, passed in
:param response: the request's response
:return: whether the link needs to be downloaded: True if it does, False otherwise
"""
houzui=item['lianjie'][item['lianjie'].rfind('.'):].strip()
jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip', '.wps', '.pdf','.jpg', '.png', '.jpeg', '.gif', '.svg']
s=False
for jiewei_sign in jiewei:
if item['lianjie'].endswith(jiewei_sign):
s=True
break
if s:
year = datetime.datetime.now().strftime('%Y')
mouth = datetime.datetime.now().strftime('%m')
item['wenjian'] = [{'file_name': '原文件'}]
item['wenjian'][0]['file_url'] = item['lianjie']
houzui = item['wenjian'][0]['file_url'][item['wenjian'][0]['file_url'].rfind('/') + 1:]
new_url = '/' + year + '/' + mouth + '/' + cls.short_uuid() + '_' + houzui
item['wenjian'][0]['new_file'] = new_url
item['xiangqing'] = '<div><p>请查看原文附件:<a href="' + settings.get(
'FILE_PATH') + new_url + '">原文件</a></p></div>'
return s
@classmethod
def jiewei_href_contains(cls):
"""
Return the XPath contains() condition covering our document file suffixes.
:return:
"""
str = ''
jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip', '.wps', '.pdf']
for j in jiewei:
str += 'contains(@href,"' + j + '")' + ' or '
str = str.strip().strip('or').strip()
return str
@classmethod
def jiewei_src_contains(cls):
"""
Return the XPath contains() condition covering our image file suffixes.
:return:
"""
str = ''
jiewei = ['.jpg', '.png', '.jpeg', '.gif', '.svg']
for j in jiewei:
str += 'contains(@src,"' + j + '")' + ' or '
str = str.strip().strip('or').strip()
return str
@classmethod
def short_uuid(cls):
uuidChars = ("a", "b", "c", "d", "e", "f",
"g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
"t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5",
"6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
"W", "X", "Y", "Z")
uuid = str(uuid4()).replace('-', '')
result = ''
for i in range(0, 8):
sub = uuid[i * 4: i * 4 + 4]
x = int(sub, 16)
result += uuidChars[x % 0x3E]
return result
@classmethod
def tihuan_a_return(cls, item, tihuanlujing,response=None):
txt = item['xiangqing']
"""
Replace the <a> tags in the page HTML, rewrite their hrefs, and record the files in the item; subclasses must implement the a_fun method.
:param item: the item object the results are stored into
:param tihuanlujing: the path prefix used when rewriting the file paths
:return:
"""
year = datetime.datetime.now().strftime('%Y')
mouth = datetime.datetime.now().strftime('%m')
panDuanNone = lambda x: '_' if x is None else x
html = etree.HTML(txt)
c='//a[@href and (' + cls.jiewei_href_contains() + ')]'
alis = html.xpath('//a[@href and (' + cls.jiewei_href_contains() + ')]')
for alis_single in alis:
single_a_file = {}
href = str(alis_single.xpath('@href')[0])
content = str(panDuanNone(alis_single.xpath('string(.)')))
if content.strip() == '':
content='_'
single_a_file['file_name'] = content
# Only this part needs changing per site: the actual download link address
old_url = href
if href.lower().startswith('http'):
single_a_file['file_url']=old_url
elif response!=None and (old_url.lower().startswith('./') or old_url.lower().startswith('../')):
single_a_file['file_url']=response.urljoin(old_url)
elif response!=None and old_url.lower().startswith('/'):
single_a_file['file_url']=response.urljoin(old_url)
else:
single_a_file['file_url'] = cls.a_fun(cls,href)
houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
new_url = '/' + year + '/' + mouth + '/' + cls.short_uuid() + '_' + houzui
txt = txt.replace(old_url, tihuanlujing + new_url)
single_a_file['new_file'] = new_url
try:
item['wenjian'].append(single_a_file)
except:
item['wenjian'] = [single_a_file]
item['xiangqing'] = txt
@classmethod
def tihuan_img_return(cls, item, tihuanlujing,response=None):
txt=item['xiangqing']
"""
Replace the <img> src attributes in the page HTML, rewrite them, and record the images in the item.
:param item: the item object the results are stored into
:param tihuanlujing: the path prefix used when rewriting the image paths
:return:
"""
year = datetime.datetime.now().strftime('%Y')
mouth = datetime.datetime.now().strftime('%m')
panDuanNone = lambda x: '_' if x is None else x
html = etree.HTML(txt)
imglis = html.xpath('//img[@src and (' + cls.jiewei_src_contains() + ')]')
for imglis_single in imglis:
single_src_file = {}
src = str(imglis_single.xpath('@src')[0])
content = str(panDuanNone(imglis_single.xpath('string(.)')))
if content.strip() == '':
content='_'
single_src_file['file_name'] = content
old_url = src
# Only this part needs changing per site: the actual download link address
if old_url.lower().startswith('http'):
single_src_file['file_url']=old_url
elif response != None and (old_url.lower().startswith('./') or old_url.lower().startswith('../')):
single_src_file['file_url'] = response.urljoin(old_url)
elif response !=None and old_url.lower().startswith('/'):
single_src_file['file_url'] = response.urljoin(old_url)
else:
single_src_file['file_url'] = cls.img_fun(cls,src)
houzui = single_src_file['file_url'][single_src_file['file_url'].rfind('/') + 1:]
new_url = '/' + year + '/' + mouth + '/' + cls.short_uuid() + '_' + houzui
txt = txt.replace(old_url, tihuanlujing + new_url)
single_src_file['new_file'] = new_url
try:
item['wenjian'].append(single_src_file)
except:
item['wenjian'] = [single_src_file]
item['xiangqing'] = txt
@abstractmethod
def a_fun(self,href):
"""
Return the href after it has been processed.
:param href:
:return:
"""
pass
@abstractmethod
def img_fun(self, src):
"""
Return the src after it has been processed.
:param src:
:return:
"""
pass
# Shared asynchronous-insert base class
class Asyninser(object):
'''
Subclasses keep this initializer:
def __init__(self,dbpool):
self.dbpool=dbpool
and implement do_insert:
def do_insert(self, cursor, item):
'''
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings): # fixed method name, called by Scrapy; the settings values are available here directly
"""
Establish the database connection.
:param settings: configuration parameters
:return: an instance of the pipeline
"""
adbparams = dict(
host=settings['MYSQL_HOST'],
db=settings['MYSQL_DATABASE'],
user=settings['MYSQL_USER'],
password=settings['MYSQL_PASSWORD'],
cursorclass=pymysql.cursors.DictCursor # specify the cursor type
)
# connection pool (adbapi ConnectionPool), connecting via pymysql or MySQLdb
dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# return an instance of the pipeline
return cls(dbpool)
def close_spider(self, spider):
logging.info('爬虫运行完毕了')
def process_item(self, item, spider):
"""
Use Twisted to make the MySQL insert asynchronous: the connection pool runs the actual SQL and returns a Deferred.
"""
asynItem = copy.deepcopy(item)
query = self.dbpool.runInteraction(self.do_insert, asynItem) # specify the operation method and the data to operate on
# add error handling
query.addErrback(self.handle_error,asynItem,spider) # handle exceptions
return asynItem
def handle_error(self, failure,asynItem,spider):
if failure:
# log the error information
logging.info('----------数据库插入异常信息--------')
logging.info(failure)
logging.info('---------异常信息结束--------')
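Util.py introduces two reusable bases: Util_WANG, whose attachment/image rewriting expects subclasses to supply a_fun and img_fun, and Asyninser, a Twisted adbapi pipeline base that expects subclasses to supply do_insert. A minimal sketch of how concrete classes could plug into them follows; the class names, table, columns, and base URL are illustrative assumptions and not part of this commit.

# Hypothetical usage sketch (not part of this commit); table, columns and base URL are made up.
import logging
from demo1.Util import Asyninser, Util_WANG

class ExampleMysqlPipeline(Asyninser):
    def do_insert(self, cursor, item):
        # Runs in the adbapi thread pool; cursor is the pymysql DictCursor set up in from_settings.
        cursor.execute(
            "INSERT INTO notice (lianjie, xiangqing) VALUES (%s, %s)",
            (item.get('lianjie'), item.get('xiangqing')),
        )
        logging.info('inserted %s', item.get('lianjie'))

class ExampleUtil(Util_WANG):
    def a_fun(self, href):
        # Turn a site-relative attachment href into an absolute download URL (assumed base).
        return 'https://example.gov.cn' + href

    def img_fun(self, src):
        return 'https://example.gov.cn' + src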

BIN
demo1/__pycache__/Util.cpython-37.pyc

Binary file not shown.

BIN
demo1/__pycache__/custom_settings_conf.cpython-37.pyc

Binary file not shown.

BIN
demo1/__pycache__/items.cpython-37.pyc

Binary file not shown.

BIN
demo1/__pycache__/middlewares.cpython-37.pyc

Binary file not shown.

BIN
demo1/__pycache__/pipelines.cpython-37.pyc

Binary file not shown.

BIN
demo1/__pycache__/settings.cpython-37.pyc

Binary file not shown.

11
demo1/crawl.py

@@ -0,0 +1,11 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
# myspd1 is the spider name
process.crawl('myspd1')
process.crawl('myspd2')
process.crawl('myspd3')
process.start()
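crawl.py uses the standard CrawlerProcess pattern with placeholder spider names. A sketch of the same idea that discovers every spider registered in the project instead of hard-coding names (plain Scrapy API, not part of this commit):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
for spider_name in process.spider_loader.list():  # names come from each spider's `name` attribute
    process.crawl(spider_name)
process.start()  # blocks until every crawl has finished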

390
demo1/custom_settings_conf.py

@@ -0,0 +1,390 @@
import datetime
current_day = datetime.datetime.now()
# Chacewang
custom_settings_conf_chacewang = {
# To resume-crawl the site set this to False; to traverse the whole site again set it to True
#'ISQUANPA': False,
'LOG_FILE': "logs/chacewang_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 8,
# concurrency
'CONCURRENT_REQUESTS': 3,
'ITEM_PIPELINES': {
'demo1.pipelines.MysqlYiBUPipeline': 678,
}
}
# Ministry of Science and Technology
custom_settings_conf_kexujishubu = {
# Whether to crawl the whole site: write true for a full crawl, false otherwise
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE': "logs/kexujishubu_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
'demo1.pipelines.kexujishubuPipeline': 679
}
}
# Ministry of Industry and Information Technology
custom_settings_conf_gongyehexinxihuabu = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE': "logs/gongyehexinxihuabu_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
'demo1.pipelines.gongyehexinxihuabuPipline': 680
}
}
# National Natural Science Foundation of China
custom_settings_conf_ziranweiyuanhui = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/ziranweiyuanhui_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Torch Center
custom_settings_conf_huojuzhognxin = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/huojuzhongxin_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# National Development and Reform Commission (NDRC)
custom_settings_conf_fagaiwei = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/fagaiwei_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Ministry of Culture and Tourism
custom_settings_conf_wenhuahelvyoubu = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/wenhuahelvyoubu_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Administration for Small and Medium-sized Enterprises
custom_settings_conf_zhongxiaoqiyezongju = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/zhongxiaoqiyezongju_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Shanxi Small Enterprise Development Promotion Bureau
custom_settings_conf_cujinjuSpider = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/fazhancujinju_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Shanxi Provincial Department of Science and Technology
custom_settings_conf_sxkejitingSpider = {
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/sxkejiting_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Shanxi Provincial Department of Commerce
custom_settings_conf_sxShangwutingSpider={
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/sxshangwuting_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Shanxi Provincial Department of Industry and Information Technology
custom_settings_conf_sxgongxintingSpider={
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/sxgongxinting_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Shanxi Comprehensive Reform Demonstration Zone
custom_settings_conf_sxzonggaishifanSpider={
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/sxzonggaishifanqu_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Shanxi Development and Reform Commission
custom_settings_conf_sxfagaiweiSpider={
'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/sxfagaiwei_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Taiyuan Science and Technology Bureau
custom_settings_conf_taiyuankexuejishujuSpider={
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/taiyuankjj_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Taiyuan Bureau of Industry and Information Technology
custom_settings_conf_taiyuangongyehexinxihuajuSpider={
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/taiyuangongyehexinxihuaju_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Taiyuan Bureau of Commerce
custom_settings_conf_taiyuanshangwujuSpider={
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/taiyuanshangwuju_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 1,
# concurrency
'CONCURRENT_REQUESTS': 16,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
# Qicetong
custom_settings_conf_qicetongSpider={
'SHENBAOTONGZHI':"M7EkvSokQa3QVgX6WFf5LP",
#'ISQUANPA': False,
# LOG_ENABLED default: True, enables logging
# LOG_ENCODING default: 'utf-8', the encoding used for logging
# LOG_FILE default: None, name of the logging output file created in the current directory
# LOG_LEVEL default: 'DEBUG', the minimum log level
# LOG_STDOUT default: False; if True, all of the process's standard output (and errors) is redirected to the log.
# For example, print "hello" would then show up in the Scrapy log.
'LOG_FILE' :"logs/qicetong_{}_{}.log".format(current_day.year, current_day.month, current_day.day),
# download delay
'DOWNLOAD_DELAY': 8,
# concurrency
'CONCURRENT_REQUESTS': 3,
'DOWNLOADER_MIDDLEWARES': {
'demo1.middlewares.DingZhiCookieMiddleware': None,
},
'ITEM_PIPELINES': {
# no dedicated pipeline written; reuse this one
'demo1.pipelines.ziranweiyuanhuiPipline': 681
}
}
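Each dict above is meant to be attached to a spider as its custom_settings, overriding settings.py per spider. A hedged sketch of that wiring follows; the spider class and its name attribute are assumptions, and the URL is taken from the chacewang log below.

import scrapy
from demo1.custom_settings_conf import custom_settings_conf_chacewang

class ChacewangSpider(scrapy.Spider):
    name = 'chacewang'                                 # assumed name; the real spider lives in demo1/spiders/chacewangSpider.py
    custom_settings = custom_settings_conf_chacewang   # per-spider override of settings.py

    def start_requests(self):
        yield scrapy.Request('https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=&more=False')

    def parse(self, response):
        pass                                           # the real parsing logic is in the spider file, not reproduced here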

2
demo1/items.py

@@ -27,8 +27,10 @@ class Shouyelianjie(scrapy.Item):
shijian = scrapy.Field()#publish time
xiangqing=scrapy.Field()#detail content
biaoqian=scrapy.Field()#tags, e.g. ex-post funding, talent recognition and funding
diqu=scrapy.Field()#stores the region code; the default should be 100000
#list of files (including images)
wenjian=scrapy.Field()
#whether the record already exists
count=scrapy.Field()
yuanwenurl=scrapy.Field()#link to the original article
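A short sketch of how the fields added in this hunk get filled; the values are illustrative, and the wenjian dict keys match what Util_WANG writes.

from demo1.items import Shouyelianjie

item = Shouyelianjie()
item['diqu'] = '100000'        # region code; 100000 is the stated default
item['wenjian'] = [{           # same keys Util_WANG produces: file_name / file_url / new_file
    'file_name': 'report.pdf',
    'file_url': 'https://example.gov.cn/uploads/report.pdf',
    'new_file': '/2020/09/a1b2c3d4_report.pdf',
}]
item['count'] = 0              # the "whether the record already exists" flag
item['yuanwenurl'] = 'https://example.gov.cn/notice/123'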

964
demo1/logs/chacewang_2020_9.log

@@ -0,0 +1,964 @@
2020-09-15 11:13:27 [scrapy.extensions.telnet] INFO: Telnet Password: dc7ac6e8f6616faa
2020-09-15 11:13:27 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:13:28 [root] INFO: 我是RundomUserAgentMiddleware
2020-09-15 11:13:28 [root] INFO: 我是DingZhiCookieMiddleware
2020-09-15 11:13:28 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'demo1.middlewares.RundomUserAgentMiddleware',
'demo1.middlewares.DingZhiCookieMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:13:28 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:13:28 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.MysqlYiBUPipeline']
2020-09-15 11:13:28 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:13:28 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:13:28 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:13:29 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=&more=False> (referer: None)
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/72721
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/72210
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/71141
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/70608
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/70941
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69801
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69805
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69804
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68878
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68458
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68456
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68411
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68455
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68167
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68969
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/67377
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/67388
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68453
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/67188
2020-09-15 11:13:29 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66875
2020-09-15 11:13:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=&more=False> (referer: None)
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69799
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69810
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68426
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69811
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68415
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69812
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68421
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68423
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65867
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65904
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65903
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65855
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65889
2020-09-15 11:13:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65850
2020-09-15 11:13:40 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/64663
2020-09-15 11:13:40 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65908
2020-09-15 11:13:40 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65870
2020-09-15 11:13:40 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/63857
2020-09-15 11:13:40 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/63631
2020-09-15 11:13:40 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/63242
2020-09-15 11:13:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=&more=False> (referer: None)
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69795
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69786
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/69279
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68459
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68971
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68414
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68975
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68427
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/67152
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68418
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66918
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66500
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65858
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65890
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68420
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65909
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65295
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65678
2020-09-15 11:13:48 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65862
2020-09-15 11:13:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=&more=False> (referer: None)
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/55831
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/55836
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/55839
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/55841
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/46922
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/46923
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/46924
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/42979
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/42994
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40859
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/42997
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40330
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40327
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/39293
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40306
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40307
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40338
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40309
2020-09-15 11:13:59 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40310
2020-09-15 11:14:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/News/PIndex_New?searchText=&pageindex=1&pageSize=20&chaPlate=1&citycode=&cityJudge=> (referer: https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=&more=False)
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66899
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66901
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66916
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66903
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66917
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/67281
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66905
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66907
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66497
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66486
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66904
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/68422
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66498
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/66014
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65854
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65861
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65869
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65853
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65515
2020-09-15 11:14:11 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65637
2020-09-15 11:14:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/News/PIndex_New?searchText=&pageindex=1&pageSize=20&chaPlate=2&citycode=&cityJudge=> (referer: https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=&more=False)
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62685
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62398
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65888
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62107
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/61458
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/61395
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/61389
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/61396
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/59300
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/58975
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/59298
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/58832
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/58833
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/58091
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/57600
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/57589
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/56711
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/56730
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/56521
2020-09-15 11:14:19 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/56513
2020-09-15 11:14:28 [scrapy.extensions.logstats] INFO: Crawled 6 pages (at 6 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:14:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/News/PIndex_New?searchText=&pageindex=1&pageSize=20&chaPlate=3&citycode=&cityJudge=> (referer: https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=&more=False)
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65893
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65896
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65900
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65898
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65865
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65902
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/63856
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65857
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/65866
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/63386
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62268
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62269
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62271
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62120
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62109
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/62115
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/71669
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/61388
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/61391
2020-09-15 11:14:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/60721
2020-09-15 11:14:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/News/PIndex_New?searchText=&pageindex=1&pageSize=20&chaPlate=4&citycode=&cityJudge=> (referer: https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=&more=False)
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40341
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40342
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40312
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40345
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40346
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40317
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40356
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/32286
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31963
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/40357
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31047
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31048
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31049
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31040
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31044
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31050
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31041
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31046
2020-09-15 11:14:41 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/31051
2020-09-15 11:14:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/NewsDetail/40340> (referer: https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=&more=False)
2020-09-15 11:14:50 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.chacewang.com/news/NewsDetail/40340>
{'biaoqian': ['科研立项'],
'biaoti': '2019年度专项项目极地基础科学前沿项目指南',
'jianjie': '国家自然科学基金委员会现启动“极地基础科学前沿”专项项目, '
'从冰下基岩和湖泊科学钻探、南极气候环境演化和北极多圈层相互作用三个角度,开展探索极地海-陆-气-冰-生态耦合系统的基础科学问题研究。',
'laiyuan': '国家自然科学基金委员会',
'leixing': '申报指南',
'lianjie': 'https://www.chacewang.com/news/NewsDetail/40340',
'shijian': '2019-10-24',
'xiangqing': '<div >\n'
'<p >\n'
'\t<span '
'>南北两极作为全球治理新焦点、科技竞争新高地、海上新通道和资源新产地,已成为人类活动发展的“新疆域”以及世界大国经略全球的战略要地。为落实习近平总书记 '
'“认识南极、保护南极、利用南极”等关于极地的批示精神,充分发挥国家自然科学基金根据国家科技发展战略,吸引和调动全国高等院校、科研机构的力量解决国家重大需求背后的基础科学问题的支撑作用,为突破极地变化预测的关键技术瓶颈奠定理论基础,为我国应对气候变化和参与全球治理提供科学支撑。国家自然科学基金委员会现启动“极地基础科学前沿”专项项目, '
'从冰下基岩和湖泊科学钻探、南极气候环境演化和北极多圈层相互作用三个角度,开展探索极地海-陆-气-冰-生态耦合系统的基础科学问题研究。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>一、科学目标</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'发展极地深冰钻、多平台协同观测等技术手段,揭示极地多圈层相互作用过程和机理,评估南极冰盖-冰架-海冰系统的不稳定性及其潜在影响,解析北极快速变化的关键物理-化学-生物过程与主要驱动因素,提高对极地变化的预测能力,增强我国在极地科学领域的学术话语权。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>二、拟资助研究方向和研究内容</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(一)东南极古大陆的早期演化</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'基于冰下基岩钻探、各类露头剖面和地球物理方法等,研究南极大陆早期陆核的形成过程及其与澳大利亚、非洲等大陆的亲缘关系,揭示后期大陆块体聚合的时限、过程和机制,构建东南极古大陆从初始成核到最终聚陆的历史框架。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(二)南极冰下湖科学钻探选址与研究</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'通过冰雷达和航空遥感等技术手段,开展冰下湖科学钻探选址,对冰层热熔钻孔倾斜和纠斜机理、钻孔闭合及其对钻具冻胀机理、冰下湖体系的理化参数与水质特征进行研究,探索南极冰下湖的形成演化过程和冰下环境的生命形态。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(三)南极冰盖结构与动力学模型</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'利用航空遥感和现场观测等技术手段,研究东南极冰盖的冰层结构和底部融水过程,分析冰下地热通量和深部冰温分布,获取冰盖接地线区域的冰下精细地形,构建可靠的冰盖动力学模型,定量估算冰盖的物质平衡和稳定性。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(四)东南极海洋环流与冰架的相互作用</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'通过高分辨率数值试验和观测资料分析,研究东南极多尺度海洋环流对冰架底部质量平衡的影响、冰架出流水对海洋环境的影响、冰架-海洋界面的边界层过程及其参数化方案,提高对冰架-海洋耦合系统的模拟和预测能力。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(五)西南极冰-海相互作用与海洋生态系统</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'通过环境与生态的多尺度综合观测、现场实验及数据与模型的综合分析,研究西南极冰-海环境和生态结构的时空变异、冰-海相互作用对海洋过程的调控机理,认知气候变化对生物生产力、种群结构和碳通量的潜在影响。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(六)南极海冰变化的机制及影响</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'利用耦合模式、卫星遥感数据及资料同化技术,研究南极海冰范围和体积在全球变化背景下的缓变与突变过程及机制,分析海冰变化对南极冰盖和气候系统的影响,为预估南极海冰变化及其全球效应提供科学依据和技术支撑。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(七)南极冰盖对全球增温的敏感性</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span '
'>利用资料诊断、理论分析和数值模拟等手段,研究气候变暖对南极冰盖影响的程度、途径、时空变化特征及机理,建立全球增温影响南极冰盖的物理图像,评估南极冰盖对气候变暖响应的敏感性及可能产生的全球效应。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(八)北极大气多要素变化观测与诊断研究</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'应用先进的地基和星基环境光学装备和技术等,开展北极大气痕量气体组分和气溶胶的长期监测,建立大气环境参数综合分析方法,获得对流层大气关键成分的区域和垂直分布特征,揭示大气辐射强迫对北极快速变化的贡献。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(九)北极快速变化的能量过程研究</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'通过北极海冰-大气与海冰-海洋界面上的热通量观测与分析,研究北极大气、海洋过程对北极气候系统中能量分布与输运的影响,揭示影响北极快速变化的能量收支关键过程,提升对北极未来变化趋势的预测能力。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(十)环北极海洋初级生产过程与生源要素循环</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'通过对典型北极海冰快速减退区域走航、船基和遥感观测,冰浮标和潜标周年多要素同步观测,揭示北冰洋营养盐、初级生态过程和浮游植物的变化规律,评估海洋生源要素循环和生物泵过程对北极快速变化的响应和反馈。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(十一)环北极海-冰-气界面物质交换及其气候效应</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'通过对典型北极海冰快速减退区域走航和冰基观测,获取生源活性气体等挥发性成分和半挥发性成分等物质的海-气或冰-气交换通量, '
'揭示其时空分布格局、变化趋势、源汇及转化机制,评估其对北极快速变化的响应和反馈。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>(十二)北极快速变化归因与环境效应研究</span></strong><strong/>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'模拟并解析北极海冰快速融化的主要驱动因素,厘清其中自然和人为胁迫的相对贡献。研究极地植被变化及其对区域生物地球化学循环和生物物理特征的影响。定量评估北极海冰融化和极地植被变化对北极区域气候的可能影响。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>三、项目遴选的基本原则</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'申请人应根据本专项拟解决的具体科学问题和项目指南公布的拟资助研究方向,自行拟定项目名称、科学目标、研究内容、技术路线和相应的研究经费等。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'除按照撰写提纲的要求外,申请书内容还须体现如下几个方面:(1)申请项目为实现总体科学目标的贡献;(2)针对指南中研究方向拟重点突破的科学问题、达到的研究目标或技术指标;(3)为实现总体科学目标和多学科集成需要,申请人应承诺在研究材料、基础数据和实验平台上的项目集群共享。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>四、资助计划</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003 '
'本专项项目资助期限为4年,申请书中的研究期限应填写“2020年1月1日-2023年12月31日”,2019年专项项目拟资助12项,直接费用平均资助强度约350万元/项。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003</span><strong><span '
'>五、申请要求及注意事项</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003(一)申请条件。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003本专项项目申请人应当具备以下条件:</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20031. 具有承担基础研究课题的经历;</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20032. 具有高级专业技术职务(职称);</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'在站博士后研究人员、正在攻读研究生学位以及无工作单位或者所在单位不是依托单位的人员不得作为申请人进行申请。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003(二)限项申请规定。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20031. '
'本专项项目申请时不计入高级专业技术职务(职称)人员申请和承担总数3项的范围;正式接收申请到国家自然科学基金委员会作出资助与否决定之前,以及获得资助后,计入高级专业技术职务(职称)人员申请和承担总数3项的范围。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20032.申请人和参与者只能申请或参与申请1项本专项项目。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20033. 申请人同年只能申请1项专项项目中的研究项目。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003(三)申请注意事项。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20031.申请书报送日期为2019年11月25日-27日16时。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20032.本专项项目申请书采用在线方式撰写。对申请人具体要求如下:</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(1)申请人在填报申请书前,应当认真阅读本申请须知、本项目指南和《2019年度国家自然科学基金项目指南》的相关内容,不符合项目指南和相关要求的申请项目不予受理。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(2)本专项项目旨在紧密围绕核心科学问题,将对多学科相关研究进行战略性的方向引导和优势整合,成为一个专项项目集群。申请人应根据本专项拟解决的具体科学问题和项目指南公布的拟资助研究方向,自行拟定项目名称、科学目标、研究内容、技术路线和相应的研究经费等。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(3)申请人登录科学基金网络信息系统https://isisn.nsfc.gov.cn/(没有系统账号的申请人请向依托单位基金管理联系人申请开户),按照撰写提纲及相关要求撰写申请书。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(4)申请书中的资助类别选择“专项项目”,亚类说明选择“研究项目”,附注说明选择“科学部综合研究项目”,申请代码1应当选择地球科学部相应的申请代码。</span><strong><span '
'>以上选择不准确或未选择的项目申请将不予受理。</span></strong><span '
'>申请项目名称可以不同于研究方向名称,但应属该方向所辖之内的研究领域。每个专项项目的依托单位和合作研究单位数合计不得超过3个;主要参与者必须是项目的实际贡献者,不超过9人。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(5)请按照“专项项目-研究项目申请书撰写提纲”撰写申请书时,</span><strong><span '
'>请在申请书正文开头注明“2019年度专项项目极地基础科学前沿之研究方向:XXX(按照上述12个研究方向之一填写)”。</span></strong>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'申请书应突出有限目标和重点突破,明确对实现本专项项目总体目标和解决核心科学问题的贡献。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'如果申请人已经承担与本专项项目相关的其他科技计划项目,应当在申请书正文的“研究基础与工作条件”部分论述申请项目与其他相关项目的区别与联系。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(6)申请人应当认真阅读《2019年度国家自然科学基金项目指南》中预算编报须知的内容,严格按照《国家自然科学基金资助项目资金管理办法》《关于国家自然科学基金资助项目资金管理有关问题的补充通知》(财科教〔2016〕19号)以及《国家自然科学基金项目资金预算表编制说明》的要求,认真如实编报《国家自然科学基金项目资金预算表》。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(7)申请人完成申请书撰写后,在线提交电子申请书及附件材料,下载打印最终PDF版本申请书,并保证纸质申请书与电子版内容一致。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(8)申请人应及时向依托单位提交签字后的纸质申请书原件以及其他特别说明要求提交的纸质材料原件等附件。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20033. '
'依托单位应对本单位申请人所提交申请材料的真实性和完整性进行审核,并在规定时间内将申请材料报送国家自然科学基金委员会。具体要求如下:</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(1)应在规定的项目申请截止日期前提交本单位电子版申请书及附件材料,并统一报送经单位签字盖章后的纸质申请书原件(一式一份)及要求报送的纸质附件材料。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003(2)提交电子版申请书时,应通过信息系统逐项确认。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(3)报送纸质申请材料时,还应包括由法定代表人签字、依托单位加盖公章的依托单位科研诚信承诺书(请在信息系统中下载)和申请项目清单,材料不完整不予接收。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'(4)可将纸质申请材料直接送达或邮寄至国家自然科学基金委员会项目材料接收工作组。采用邮寄方式的,请在项目申请截止时间前(以发信邮戳日期为准)以快递方式邮寄,以免延误申请,并在信封左下角注明“专项项目申请材料”。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'4.申请书由国家自然科学基金委员会项目材料接收工作组负责接收,材料接收工作组联系方式如下:</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'通讯地址:北京市海淀区双清路83号国家自然科学基金委员会项目材料接收工作组(行政楼101房间)</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003邮\u2003\u2003编:100085</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003联系电话:010-62328591</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u20035.本专项项目咨询方式:</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003国家自然科学基金委员会地球科学部综合与战略规划处</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003联系电话:010-62327157</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003(四)其他注意事项。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'1.为实现专项总体科学目标,获得资助的项目负责人应当承诺遵守相关数据和资料管理与共享的规定,项目执行过程中须关注与本专项其他项目之间的相互支撑关系。</span>\n'
'</p>\n'
'<p >\n'
'\t<span >\u2003\u2003'
'2.为加强项目之间的学术交流,促进专项项目集群的形成和多学科交叉,本专项项目集群将设专项项目总体指导组和管理协调组,每年举办一次资助项目的年度学术交流会,并将不定期地组织相关领域的学术研讨会。获资助项目负责人必须参加上述学术交流活动,并认真开展学术交流。</span>\n'
'</p>\n'
' <p>\n'
' 原文链接:\n'
' </p>\n'
' <p id="appendix">\n'
' <a '
'href="http://www.nsfc.gov.cn/publish/portal0/tab442/info76520.htm" '
'target="_blank">http://www.nsfc.gov.cn/publish/portal0/tab442/info76520.htm</a>\n'
' </p>\n'
'\n'
' <div ><a href="#" /><a href="#" /><a '
'href="#" /><a href="#" /><a href="#" /><a href="#" /><a '
'href="#" /></div>\n'
' </div>'}
2020-09-15 11:14:50 [root] INFO: 插入完成
2020-09-15 11:14:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/NewsDetail/74898> (referer: https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=&more=False)
2020-09-15 11:14:59 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.chacewang.com/news/NewsDetail/74898>
{'biaoqian': ['商贸物流'],
'biaoti': '关于印发《推动物流业制造业深度融合创新发展实施方案》的通知(发改经贸〔2020〕1315号)',
'jianjie': '国家发展改革委会同工业和信息化部等部门和单位研究制定了《推动物流业制造业深度融合创新发展实施方案》,现印发给你们,请认真贯彻执行。',
'laiyuan': '发改委',
'leixing': '政策动态',
'lianjie': 'https://www.chacewang.com/news/NewsDetail/74898',
'shijian': '2020-09-09',
'wenjian': ['《推动物流业制造业深度融合创新发展实施方案》',
'https://www.chacewang.com/Home/OssDownload/upload/NewsWordUpload/《推动物流业制造业深度融合创新发展实施方案》_20200910.pdf',
'/2020/09/78niIbLK_《推动物流业制造业深度融合创新发展实施方案》_20200910.pdf'],
'xiangqing': '<div >\n'
'<div >\n'
'\t<div >\n'
'\t</div>\n'
'\t<div >\n'
'\t\t<span '
'>各省、自治区、直辖市及计划单列市、新疆生产建设兵团发展改革委、工业和信息化主管部门、公安厅、财政厅、自然资源主管部门、交通运输厅(局、委)、农业农村(农牧)厅(局、委)、商务厅(局、委)、市场监管局(厅、委)、银保监局,各地区铁路监督管理局,民航各地区管理局,邮政管理局,各铁路局集团公司:</span>\n'
'\t</div>\n'
'<span ><span >\u2003\u2003'
'为贯彻落实党中央、国务院关于推动高质量发展的决策部署,做好“六稳”工作,落实“六保”任务,进一步推动物流业制造业深度融合、创新发展,推进物流降本增效,促进制造业转型升级,国家发展改革委会同工业和信息化部等部门和单位研究制定了《推动物流业制造业深度融合创新发展实施方案》,现印发给你们,请认真贯彻执行。</span></span><br/>\n'
'<br/>\n'
'\t<div >\n'
'\t\t<span >国家发</span><span >展改革委</span>\n'
'\t</div>\n'
'<span >\n'
'\t<div >\n'
'\t\t<span >工业和信息化部</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >公安部</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >财政部</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >自 然 资 源 部</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >交 通 运 输 部</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >农 业 农 村 部</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >商务部</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >市场监管总局</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >银\u2003保\u2003监\u2003会</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >国 家 铁 路 局</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >民航局</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >国 家 邮 政 局</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >中国国家铁路集团有限公司</span>\n'
'\t</div>\n'
'</span><span >\n'
'\t<div >\n'
'\t\t<span >2020年8月22日</span>\n'
'\t</div>\n'
'</span>\n'
'\t<div>\n'
'\t</div>\n'
'</div>\n'
' <p>\n'
' 附件:\n'
' </p>\n'
' <p id="appendix">\n'
' <a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/78niIbLK_《推动物流业制造业深度融合创新发展实施方案》_20200910.pdf">《推动物流业制造业深度融合创新发展实施方案》</a>\n'
' <br/>\n'
' </p>\n'
' <p>\n'
' 原文链接:\n'
' </p>\n'
' <p id="appendix">\n'
' <a '
'href="https://www.ndrc.gov.cn/xwdt/tzgg/202009/t20200909_1237849.html" '
'target="_blank">https://www.ndrc.gov.cn/xwdt/tzgg/202009/t20200909_1237849.html</a>\n'
' </p>\n'
'\n'
' <div ><a href="#" /><a href="#" /><a '
'href="#" /><a href="#" /><a href="#" /><a href="#" /><a '
'href="#" /></div>\n'
' </div>'}
2020-09-15 11:14:59 [root] INFO: 插入完成
2020-09-15 11:15:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False> (referer: None)
2020-09-15 11:15:05 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6188
2020-09-15 11:15:05 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False----这个就是一页啊
2020-09-15 11:15:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/NewsDetail/31052> (referer: https://www.chacewang.com/News/PIndex_New?searchText=&pageindex=1&pageSize=20&chaPlate=4&citycode=&cityJudge=)
2020-09-15 11:15:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.chacewang.com/news/NewsDetail/31052>
{'biaoqian': ['科研立项', '重大项目'],
'biaoti': '关于发布大气细颗粒物的毒理与健康效应重大研究计划2019年度项目指南的通告',
'jianjie': '国家自然科学基金委员会现发布“大气细颗粒物的毒理与健康效应”重大研究计划2019年度项目指南,请申请人及依托单位按项目指南中所述的要求和注意事项申请。',
'laiyuan': '国家自然科学基金委员会',
'leixing': '申报指南',
'lianjie': 'https://www.chacewang.com/news/NewsDetail/31052',
'shijian': '2019-08-16',
'xiangqing': '<div >\n'
'<ul >\n'
'\t<div>\n'
'\t\t<div >\n'
'\t\t\t<span id="zoom">\n'
'\t\t\t<p>\n'
'\t\t\t\t'
'国家自然科学基金委员会现发布“大气细颗粒物的毒理与健康效应”重大研究计划2019年度项目指南,请申请人及依托单位按项目指南中所述的要求和注意事项申请。\n'
'\t\t\t</p>\n'
'</span>\n'
'\t\t</div>\n'
'\t</div>\n'
'\t<p>\n'
'\t\t</p><p>\n'
'\t\t\t</p><h1 >\n'
'\t\t\t\t<span >大气细颗粒物的毒理与健康效应重大研究计划2019年度项目指南</span>\n'
'\t\t\t</h1>\n'
'\t\t\n'
'\t\n'
'\t<p>\n'
'\t\t</p><p>\n'
'\t\t\t<span >\n'
'\t\t\t<ul >\n'
'\t\t\t\t<div>\n'
'\t\t\t\t\t<div >\n'
'\t\t\t\t\t\t<span id="zoom"> \n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t'
'结合我国大气污染特点,重点开展大气细颗粒物的毒理机制与健康危害研究,促进我国环境污染与健康领域研究的跨越发展,满足保护环境、改善民生的重大战略需求。本重大研究计划拟组织化学、环境、毒理学、生命、医学等多学科领域专家进行系统的基础研究和合作攻关,通过理论与方法学创新,在探明细颗粒物关键致毒组分与毒性机理的基础上,研究其生物效应和与健康危害相关的影响机制。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<strong>一、科学目标</strong>\xa0\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'本重大研究计划拟围绕大气细颗粒物毒理机制与健康危害重大科学问题,解析雾霾关键毒性成分及其来源和暴露途径;提出并建立个体水平和人群水平暴露评估的方法,阐明我国雾霾高发地区大气细颗粒物污染的暴露特征;寻找并利用代谢组、遗传和表观遗传生物标志物,解析细颗粒物对关键信号路径的扰动作用,诠释我国特征大气细颗粒物毒性组分的生物学效应和毒理学机制;揭示大气细颗粒物可能诱发的机体应答与机体损伤作用机理,阐明大气细颗粒物污染与相关疾病的联系及其可能的影响机制。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<strong>二、核心科学问题</strong>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'本重大研究计划的核心科学问题是“大气细颗粒物的毒性组分、毒理机制与健康危害”。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(一)典型区域大气细颗粒物毒性组分及暴露研究方法学。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(二)大气细颗粒物毒性组分的生物学效应与毒理学机制。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(三)大气细颗粒物的健康危害效应。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<strong>三、2019年度重点资助研究方向</strong>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u20032019年拟在前四年资助项目的基础上,对以下方向进行集成:\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<b>“大气细颗粒物毒性组分的生物学效应与毒理学机制”</b>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'结合大气细颗粒物毒性组分和主要健康结局,利用多组学等现代毒理学技术,解析大气细颗粒物与生物大分子相互作用机制及其对关键信号路径的扰动作用;诠释我国特征大气细颗粒物毒性组分的生物学效应和毒理学机制;揭示大气细颗粒物可能诱发的机体应答与机体损伤作用机理。 '
'<i>\xa0</i>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<strong>四、项目遴选的基本原则</strong>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'2019年度只接收集成项目申请,申请人应根据本重大研究计划拟解决的具体科学问题和项目指南公布的拟资助研究方向,在认真总结和系统梳理本重大研究计划已有相关成果和进展、明确新的提升突破点的基础上,自行拟定项目名称、科学目标、研究内容、技术路线和相应的研究经费等。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'申请书内容应体现如下几个方面:①在集成方向相关领域近期取得的主要进展;②通过集成拟重点突破的研究内容、拟达到的研究目标或技术指标;③为实现总体科学目标和多学科集成的需要,申请人应承诺在研究材料、基础数据和实验平台上的共享。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<strong>五、2019年度资助计划</strong>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'2019年度拟资助集成项目1项,资助期限为3年,直接费用平均资助强度为1200万元/项(由指导专家和评审专家组根据目标凝练和评议情况确定资助额度)。申请书中的研究期限应填写“2020年1月1日-2022年12月31日”。<b/>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<strong>六、申请要求及注意事项</strong>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(一)申请条件。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003本重大研究计划项目申请人应当具备以下条件:\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u20031.具有承担基础研究课题的经历;\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u20032.具有高级专业技术职务(职称)。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'在站博士后研究人员、正在攻读研究生学位以及无工作单位或者所在单位不是依托单位的人员不得作为申请人进行申请。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(二)限项申请规定。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'具有高级专业技术职务(职称)的人员,申请或参与申请本次发布的重大研究计划集成项目不限项。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(三)申请注意事项。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u20031.申请书报送日期为2019年9月16日- 9月20日16时。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u20032.项目申请书采用在线方式撰写。对申请人具体要求如下:\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(1)申请人在填报申请书前,应当认真阅读本项目指南和《2019年度国家自然科学基金项目指南》中申请须知和限项申请规定的相关内容,不符合项目指南和相关要求的申请项目不予受理。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(2)本重大研究计划旨在紧密围绕核心科学问题,将对多学科相关研究进行战略性的方向引导和优势整合,成为一个项目集群。申请人应根据本重大研究计划拟解决的具体科学问题和项目指南公布的拟资助研究方向,自行拟定项目名称、科学目标、研究内容、技术路线和相应的研究经费等。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(3)申请人登录科学基金网络信息系统https://isisn.nsfc.gov.cn/(没有系统账号的申请人请向依托单位基金管理联系人申请开户),按照撰写提纲及相关要求撰写申请书。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(4)申请书中的资助类别选择“重大研究计划”,亚类说明选择“集成项目”,附注说明选择“大气细颗粒物的毒理与健康效应”,根据申请的具体研究内容选择相应的申请代码。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003<b>集成项目的合作研究单位不得超过4个。</b>\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(5)申请人应当按照重大研究计划申请书的撰写提纲撰写申请书,应突出有限目标和重点突破,明确对实现本重大研究计划总体目标和解决核心科学问题的贡献。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'申请集成项目要求在本指南公布的集成方向下确定研究内容,各研究内容之间应突出相互合作、协调和有机联系,真正实现集成所确立的研究方向和目标。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'如果申请人已经承担与本重大研究计划相关的其他科技计划项目,应当在报告正文的“研究基础”部分论述申请项目与其他相关项目的区别与联系。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(6)申请人应当认真阅读《2019年度国家自然科学基金项目指南》中预算编报须知的内容,严格按照《国家自然科学基金资助项目资金管理办法》《项目资金管理有关问题的补充通知》以及《国家自然科学基金项目资金预算表编制说明》的具体要求,按照“目标相关性、政策相符性、经济合理性”的基本原则,认真编制《国家自然科学基金项目预算表》。多个单位共同承担一个项目的,项目申请人和合作研究单位的参与者应当分别编制项目预算,经所在单位审核后,由申请人汇总编制。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(7)申请人完成申请书撰写后,在线提交电子申请书及附件材料,下载打印最终PDF版本申请书,并保证纸质申请书与电子版内容一致。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(8)申请人应及时向依托单位提交签字后的纸质申请书原件以及其他特别说明要求提交的纸质材料原件等附件。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'3.依托单位应对本单位申请人所提交申请材料的真实性、完整性和合规性进行审核;对申请人申报预算的目标相关性、政策相符性和经济合理性进行审核,并在规定时间内将申请材料报送国家自然科学基金委员会。具体要求如下:\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(1)应在规定的项目申请截止日期(2019年9月20日16时)前提交本单位电子版申请书及附件材料,并统一报送经单位签字盖章后的纸质申请书原件(一式一份)及要求报送的纸质附件材料。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(2)提交电子版申请书时,应通过信息系统逐项确认。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(3)报送纸质申请材料时,还应提供由法定代表人签字、依托单位加盖公章的依托单位科研诚信承诺书,并附申请项目清单,材料不完整不予接收。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'(4)可将纸质申请材料直接送达或邮寄至国家自然科学基金委员会项目材料接收工作组。采用邮寄方式的,请在项目申请截止时间前(以发信邮戳日期为准)以快递方式邮寄,以免延误申请,并在信封左下角注明“重大研究计划项目申请材料”。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'4.申请书由国家自然科学基金委员会项目材料接收工作组负责接收,材料接收工作组联系方式如下:\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'通讯地址:北京市海淀区双清路83号,国家自然科学基金委员会项目材料接收工作组(行政楼101房间)\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003邮\u2003\u2003编:100085\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003联系电话:010-62328591\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u20035.本重大研究计划咨询方式:\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003国家自然科学基金委员会 化学科学部\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003联系电话:010-62327173\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003(四)其他注意事项。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'1.为实现重大研究计划总体科学目标和多学科集成,获得资助的项目负责人应当承诺遵守相关数据和资料管理与共享的规定,项目执行过程中应关注与本重大研究计划其他项目之间的相互支撑关系。\n'
'\t\t\t\t\t\t</p>\n'
'\t\t\t\t\t\t<p>\n'
'\t\t\t\t\t\t\t\u2003\u2003'
'2.为加强项目的学术交流,促进项目群的形成和多学科交叉与集成,本重大研究计划将每年举办一次资助项目的年度学术交流会,并将不定期地组织相关领域的学术研讨会。获资助项目负责人有义务参加本重大研究计划指导专家组和管理工作组所组织的上述学术交流活动。\n'
'\t\t\t\t\t\t</p>\n'
'</span>\n'
'\t\t\t\t\t</div>\n'
'\t\t\t\t</div>\n'
'\t\t\t</ul>\n'
'<br/>\n'
'</span>\n'
'\t\t</p>\n'
'\t\n'
'</ul>\n'
' <p>\n'
' 原文链接:\n'
' </p>\n'
' <p id="appendix">\n'
' <a '
'href="http://www.nsfc.gov.cn/publish/portal0/tab442/info76220.htm" '
'target="_blank">http://www.nsfc.gov.cn/publish/portal0/tab442/info76220.htm</a>\n'
' </p>\n'
'\n'
' <div ><a href="#" /><a href="#" /><a '
'href="#" /><a href="#" /><a href="#" /><a href="#" /><a '
'href="#" /></div>\n'
' </div>'}
2020-09-15 11:15:15 [root] INFO: 插入完成
2020-09-15 11:15:28 [scrapy.extensions.logstats] INFO: Crawled 12 pages (at 6 pages/min), scraped 3 items (at 3 items/min)
2020-09-15 11:15:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False> (referer: None)
2020-09-15 11:15:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/7275
2020-09-15 11:15:31 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False----这个就是一页啊
2020-09-15 11:15:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False> (referer: None)
2020-09-15 11:15:37 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False----没有发布过内容
2020-09-15 11:15:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False> (referer: None)
2020-09-15 11:15:44 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6192
2020-09-15 11:15:45 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6189
2020-09-15 11:15:45 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6193
2020-09-15 11:15:45 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6195
2020-09-15 11:15:45 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6190
2020-09-15 11:15:45 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6194
2020-09-15 11:15:45 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_DaTongShi&more=False----这个就是一页啊
2020-09-15 11:15:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False> (referer: None)
2020-09-15 11:15:57 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False----没有发布过内容
2020-09-15 11:16:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False> (referer: None)
2020-09-15 11:16:08 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6205
2020-09-15 11:16:08 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False----这个就是一页啊
2020-09-15 11:16:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False> (referer: None)
2020-09-15 11:16:18 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False----没有发布过内容
2020-09-15 11:16:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False> (referer: None)
2020-09-15 11:16:27 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_YangQuanShi&more=False----没有发布过内容
2020-09-15 11:16:28 [scrapy.extensions.logstats] INFO: Crawled 19 pages (at 7 pages/min), scraped 3 items (at 0 items/min)
2020-09-15 11:16:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False> (referer: None)
2020-09-15 11:16:39 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/9357
2020-09-15 11:16:39 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False----这个就是一页啊
2020-09-15 11:16:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False> (referer: None)
2020-09-15 11:16:47 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/7283
2020-09-15 11:16:47 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/7279
2020-09-15 11:16:47 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6523
2020-09-15 11:16:47 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/6296
2020-09-15 11:16:47 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False----这个就是一页啊
2020-09-15 11:16:58 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False> (referer: None)
2020-09-15 11:16:58 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False----没有发布过内容
2020-09-15 11:17:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False> (referer: None)
2020-09-15 11:17:12 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_JinZhongShi&more=False----没有发布过内容
2020-09-15 11:17:20 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False> (referer: None)
2020-09-15 11:17:20 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=1&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False----没有发布过内容
2020-09-15 11:17:28 [scrapy.extensions.logstats] INFO: Crawled 24 pages (at 5 pages/min), scraped 3 items (at 0 items/min)
2020-09-15 11:17:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False> (referer: None)
2020-09-15 11:17:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/7697
2020-09-15 11:17:31 [root] INFO: 这个链接已经爬过了-----:https://www.chacewang.com/news/NewsDetail/5416
2020-09-15 11:17:31 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=2&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False----这个就是一页啊
2020-09-15 11:17:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False> (referer: None)
2020-09-15 11:17:39 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=3&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False----没有发布过内容
2020-09-15 11:17:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False> (referer: None)
2020-09-15 11:17:51 [root] INFO: url:https://www.chacewang.com/news/PIndex_New?chaPlate=4&citycode=RegisterArea_HBDQ_Shanxi_XinZhouShi&more=False----没有发布过内容
2020-09-15 11:17:51 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:17:51 [root] INFO: 爬虫运行完毕了
2020-09-15 11:17:51 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 11933,
'downloader/request_count': 27,
'downloader/request_method_count/GET': 27,
'downloader/response_bytes': 322847,
'downloader/response_count': 27,
'downloader/response_status_count/200': 27,
'elapsed_time_seconds': 263.132746,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 17, 51, 420381),
'item_scraped_count': 3,
'log_count/DEBUG': 30,
'log_count/INFO': 209,
'request_depth_max': 2,
'response_received_count': 27,
'scheduler/dequeued': 27,
'scheduler/dequeued/memory': 27,
'scheduler/enqueued': 27,
'scheduler/enqueued/memory': 27,
'start_time': datetime.datetime(2020, 9, 15, 3, 13, 28, 287635)}
2020-09-15 11:17:51 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:15 [scrapy.extensions.telnet] INFO: Telnet Password: a15e61896a902e76
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:15 [root] INFO: 我是RundomUserAgentMiddleware
2020-09-16 08:47:15 [root] INFO: 我是DingZhiCookieMiddleware
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'demo1.middlewares.RundomUserAgentMiddleware',
'demo1.middlewares.DingZhiCookieMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.MysqlYiBUPipeline']
2020-09-16 08:47:15 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:15 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:15 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024
2020-09-16 08:47:15 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/gongyehexinxihuabu_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

413
demo1/logs/fagaiwei_2020_9.log

@@ -0,0 +1,413 @@
2020-09-15 11:21:50 [scrapy.extensions.telnet] INFO: Telnet Password: 3b5a2054bf5cce72
2020-09-15 11:21:50 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:21:50 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:21:50 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:21:50 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:21:50 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:21:50 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:21:50 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:21:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ndrc.gov.cn/xxgk/zcfb/tz/index.html> (referer: None)
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200826_1236873.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200824_1236679.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200820_1236352.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200820_1236353.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200818_1236238.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200811_1235815.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200811_1235817.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200810_1235755.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200807_1235742.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200806_1235650.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200805_1235592.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200803_1235506.html
2020-09-15 11:21:50 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202008/t20200804_1235517.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200731_1235247.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200731_1235241.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200731_1235150.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200731_1235153.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200731_1235257.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200731_1235148.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200728_1234739.html
2020-09-15 11:21:51 [root] INFO: 这个链接已经爬过了-----:https://www.ndrc.gov.cn/xxgk/zcfb/tz/202007/t20200728_1234637.html
2020-09-15 11:21:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200915_1238749.html> (referer: https://www.ndrc.gov.cn/xxgk/zcfb/tz/index.html)
2020-09-15 11:21:51 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200915_1238749.html>
{'biaoti': '关于村庄建设项目施行简易审批的指导意见(发改农经〔2020〕1337号)',
'laiyuan': '发改委',
'lianjie': 'https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200915_1238749.html',
'shijian': '2020-09-15',
'xiangqing': '<div class="article_con article_con_notitle">\n'
' <div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: '
'24px;"><strong>关于村庄建设项目施行简易审批的指导意见</strong></span></div><div '
'style="text-align: center;"><br></div><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;">发改农经〔2020〕1337号</div></span><br><span '
'style="font-family: SimSun; font-size: '
'16px;">各省、自治区、直辖市及计划单列市、新疆生产建设兵团发展改革委、自然资源主管部门、农业农村(农牧)厅(局、委):</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'人居环境、农村供水、村内道路、文化体育等村庄建设项目量大面广,投资规模较小,技术方案相对简单,建设内容较为单一。对于按照固定资产投资管理的小型村庄建设项目施行简易审批,优化审批程序,简化报批内容,改进审批方式,有利于提高审批实效,节省报批成本,加快项目推进实施,有利于推动生态宜居美丽乡村建设,尽快补上全面小康“三农”领域突出短板。现就村庄建设项目施行简易审批提出以下意见:</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>一、把握村庄建设项目施行简易审批的基本原则</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'村庄建设项目施行简易审批,要坚持规划引领、统筹谋划,守好耕地和生态保护红线,合理确定村庄建设项目布局,有序推进美丽乡村建设;坚持务实管用、便捷高效,结合不同地区和领域实际,出台行之有效的操作办法,解决各方面反映突出的难点堵点;坚持依法依规、循序渐进,在法治框架下探索完善优化审批流程和审批内容,并为推进相关政策法规修订积累经验;坚持各方协同、创新模式,通过部门间信息联通和业务协作,构建适应新时代特点的新型审批服务体系。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>二、明确简易审批适用范围</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'根据《政府投资条例》,具有审批权限的地方投资主管部门要会同有关部门,重点围绕生活垃圾污水、厕所粪污处理、村容村貌提升等农村人居环境建设,以及农村供排水、村内道路、文化体育等村庄建设领域,结合本地区实际制定并发布施行简易审批的村庄建设项目范围。鼓励对村域内实施的村庄建设项目施行简易审批。投资规模较大、技术方案相对复杂的工程,以及关系人民群众生命财产安全的房屋修造类、能源类等项目,不得适用简易审批。已经纳入城市一体管理的村庄,按照有关规定执行。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>三、简化审批程序和审批环节</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'对于适用简易审批的政府直接投资项目,地方投资主管部门要简化审批程序,可以采取审批可行性研究报告的方式,合并办理项目建议书、可行性研究报告、初步设计等审批环节。经批准的可行性研究报告,作为项目招标采购、建设实施和竣工验收的依据。对于企业投资项目,项目单位应当按照有关规定办理核准、备案手续。审批、核准、备案等投资决策程序完成后,方可履行资金申请和审批程序,并在资金申请报告中列明项目基本情况、前期工作完成情况、申请资金的政策依据等内容。各地要依法研究简化项目开工前涉及的用地、规划等审批事项办理程序。使用集体建设用地开展建设的,项目单位无须办理建设项目用地预审与选址意见书。鼓励地方各级政府采取区域综合评估方式,取代对单个项目进行评价,支持采取容缺后补、告知承诺等便利化措施,依法取消和减少村庄建设项目需要办理的审批事项。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>四、创新审批服务方式</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'鉴于适用简易审批村庄建设项目单体规模偏小、技术相对简单,允许地方结合实际,将小型村庄建设项目涉及的审批事项依法委托乡镇政府实施。提倡简化申报材料,实行一窗受理、综合办理,通过并联审批压缩办理时限,探索开展部门联办、全程帮办,切实加快村庄建设项目推动进度。地方投资主管部门应当会同有关部门制订并通过投资项目在线审批监管平台发布和实施村庄建设项目简易审批流程,并探索以互联网、手机APP等方式,为项目单位提供在线办理、进度查询等服务,不断提升审批服务水平。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>五、合理确定前期工作深度要求</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'对于适用简易审批的村庄建设项目,要在加强论证、确保质量的前提下,根据行业规程规范,区分项目类型明确前期工作深度要求。可行性研究报告一般包括建设内容及规模、建设性质、建设地址、建设工期,布置图,投资规模、资金来源与落实情况,覆盖村组范围及服务人口、管护方式,村民会议或者村民代表会议、村民小组会议决议意见,以及法律法规明确的其他内容。各地不得在法律法规之外,自行设立其他证明材料或审查意见。地方行业主管部门要结合本地区实际和本领域特点,商同级投资主管部门制定可行性研究报告申报范本,允许项目单位自行编制可行性研究报告,鼓励采用表单方式明确文本内容,着力消除模糊和兜底条款,避免机械套用、简单比照城市建设项目。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>六、落实国家招标投标等政策规定</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'严格执行招标投标法及其实施条例、政府采购法及其实施条例以及《必须招标的工程项目规定》(国家发展改革委令2018年第16号),使用国有资金投资的各类村庄建设项目,施工单项合同估算价不超过400万元,重要设备、材料等货物采购单项合同估算价不超过200万元,勘察、设计、监理等服务采购单项合同估算价不超过100万元的,可依法不进行招标。整县整乡推进的村庄建设项目,其子项目由不同项目法人组织建设实施,且该子项目达不到必须招标的规模标准的,可以不进行招标。对利用扶贫资金实行以工代赈、需要使用农民工等特殊情况,按照国家有关规定可以不进行招标。对于采取招标方式的项目,不得在法律法规外,针对投资规模、工程造价、招标文件编制等设立其他审批审核程序。对于依法不进行招标的项目,要建立完善项目村民决策监督和建设主体责任追究机制,确保项目实施公平公正、公开透明,防止暗箱操作、利益输送等情况发生。要加强项目质量管理,严格按照合同开展验收。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>七、发挥村民决策和建设主体作用</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'谋划实施项目,应当采取座谈调研、入户调查等方式听取村民诉求,充分尊重村民意愿,保障村民参与集体决策。对于安排政府投资资金的村庄建设项目,要综合考虑村庄实际和工作基础,确定项目法人单位。具备条件的,可以由村民委员会、村集体经济组织等作为项目法人。以行政村为基本单元实施的村庄建设项目,鼓励项目法人组织村民投工投劳、就地取材等开展建设。支持将政府投资村庄建设项目产权划归村集体经济组织,由其承担管护责任,鼓励地方对管护费用给予适当补助,并采取“门前三包”、使用者协会等形式,引导受益农民通过认领等方式参与管护,确保村庄建设项目长期有效运行。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>八、强化保障措施确保政策落地落实</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'地方各级投资主管部门会同有关部门要把村庄建设项目施行简易审批摆上议事日程,加强组织领导,深入研究谋划,细化配套措施,以流程优化、内容简化、时限缩减为目标,推动村庄建设项目审批流程再造。地方各级尤其是县级行业主管部门要发挥贴近基层的优势,加强技术服务、工作指导和监督管理,确保村庄建设项目质量。各地要创新监管机制,加强权力运行公开,有效防控项目管理、实施等环节的廉政风险。要及时梳理总结村庄建设项目简易审批方面取得的新进展、新成效,宣传典型案例,推广经验做法,增进村级组织、农民群众等对相关工作的理解支持,为加快村庄建设项目实施,提高农村基础设施水平营造良好氛围。</span><br><br><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">国家发展改</span><span style="font-family: SimSun; '
'font-size: 16px;">革委</span></div><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;"><span style="font-family: SimSun; font-size: 16px;">自 '
'然 资 源 部</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 16px;">农 业 农 村 '
'部</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: '
'16px;">2020年8月27日</span></div></span><div style="white-space: '
'nowrap;"><br></div>\n'
' </div><div class="attachment"><div '
'class="attachment_l">附件:</div><div class="attachment_r"><p>\n'
'\n'
'</p></div></div>'}
2020-09-15 11:21:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200909_1237860.html> (referer: https://www.ndrc.gov.cn/xxgk/zcfb/tz/index.html)
2020-09-15 11:21:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200909_1237860.html>
{'biaoti': '关于组织开展行业协会商会经营服务性收费清理规范工作的通知(发改办价格〔2020〕632号)',
'laiyuan': '发改委',
'lianjie': 'https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200909_1237860.html',
'shijian': '2020-09-09',
'xiangqing': '<div class="article_con article_con_notitle">\n'
' <div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: '
'24px;"><strong>国家发展改革委办公厅关于组织开展行业</strong></span></div><span '
'style="font-family: SimSun; font-size: 16px;"><div '
'style="text-align: center;"><strong><span '
'style="font-size:24px;">协会商会经营服务性收费清理规范工作的通知</span></strong></div></span><div '
'style="text-align: center;"><br></div><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;">发改办价格〔2020〕632号</div></span><br><span '
'style="font-family: SimSun; font-size: '
'16px;">工业和信息化部、民政部、财政部、自然资源部、人民银行、国资委、市场监管总局、银保监会、证监会办公厅(室),各省、自治区、直辖市发展改革委:</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'为持续深化“放管服”改革,进一步优化营商环境,根据《国务院办公厅关于进一步规范行业协会商会收费的通知》(国办发〔2020〕21号)要求,现就组织开展行业协会商会收费清理规范工作有关事项通知如下:</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>一、清理规范的目标</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'围绕行业协会商会经营服务性收费存在的突出问题,按照突出重点、分类规范的原则,通过深入清理规范,进一步打破服务垄断,坚决取消违法违规收费,提升收费规范性和透明度,降低偏高收费,切实降低实体经济运行成本。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>二、清理规范的措施</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'各部门要组织本行业内协会商会对收取的经营服务性等收费进行梳理,包括收费项目、收费内容、收费依据、收费主体、收费对象、收费标准、收费金额等。在此基础上对照相关法律法规和政策规定,对收费事项进行认真分析,按照以下要求开展清理规范工作。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(一)打破服务垄断。</strong>各部门要清理行业内协会商会开展的垄断性和强制性的服务项目,通过放开准入条件、引入多元化服务主体等方式实现服务价格市场化。对暂时无法破除垄断的,由行业协会商会按合理合法、补偿成本、略有盈余的原则确定收费标准,并经会员(代表)大会或理事会投票表决通过。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(二)取消违法违规收费项目。</strong>各部门应要求行业内协会商会收取会费的同时,明确所提供的基本服务项目,对提供的基本服务项目不得以有偿服务的形式另行收费,不得利用自身的强势地位强制服务并收费,全面清理取消不符合法律法规及相关政策规定收取的入会费、赞助费、会议费、培训费、考试费、评比表彰费等收费,并退还违法违规所得。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(三)降低收费标准。</strong>对收费标准偏高、盈余较多、使用不透明、企业与社会反映较强的部分重点领域,特别是银行、证券、保险、基金、期货、资产评估等履行法定职责的行业协会商会,各部门要组织开展成本审核,督促其综合考虑服务成本、会员经营状况、承受能力、行业发展水平等因素制定收费标准,降低偏高收费。\xa0\xa0'
'</span><br><span style="font-family: SimSun; font-size: '
'16px;">\u2003\u2003'
'<strong>(四)规范收费行为。</strong>各部门应要求行业协会商会按照法律法规关于经营者义务的相关规定和自愿有偿服务的原则,在宗旨和业务范围内开展有偿服务活动,规范相关收费行为,按照公平、合法、诚实守信的原则,公允确定并公开收费项目和标准,提供质价相符的服务。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>三、清理规范的组织实施</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(一)提高对清理规范工作的认识。</strong>此次清理规范工作时间紧、任务重,各地方、各部门要充分认识清理规范行业协会商会收费工作对减轻企业和社会负担的重要意义,结合实际和自身职责,进一步细化任务分工,明确时间表、路线图,确保各项任务落到实处,清理规范措施务求取得实效。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(二)集中公示行业协会商会收费。</strong>各部门要在清理规范的基础上,指导制定完善行业内协会商会服务规范,细化服务流程,提高服务质量,并要求行业协会商会于11月30日前在“信用中国”网站对清理规范后的收费情况进行公示,增加政策透明度,接受社会监督,未经公示的收费项目一律不得收取。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(三)及时报送清理规范情况。</strong>各地方、各部门要全面总结评估此次行业协会商会收费清理规范情况,将打破服务垄断、取消收费项目、降低收费标准、合计减负金额等情况梳理总结,形成书面材料(附光盘)于11月30日前报送国家发展改革委(价格司)。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(四)开展随机抽查复核。</strong>国家发展改革委将对各部门报送的清理规范情况进行汇总梳理,结合行业协会商会收费公示情况,会同相关部门针对发现的突出问题,选择部分行业协会商会进行抽查复核,深入了解实际收费情况。对抽查复核中发现的问题,将会同相关部门共同明确处理原则,提出具体处理意见,切实规范收费行为。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'联系人:卢\xa0 博\xa0 \xa068501724\xa0\xa0</span><br><span '
'style="font-family: SimSun; font-size: '
'16px;">\u2003\u2003\u2003\u2003\u2003\u2003李\xa0 硕\xa0 \xa0'
'68501921</span><br><br><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 16px;">国家发展改</span><span '
'style="font-family: SimSun; font-size: '
'16px;">革委办公厅</span></div><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: '
'center;">2020年8月21日</div></span><div style="white-space: '
'nowrap;"><br></div>\n'
' </div><div class="attachment"><div '
'class="attachment_l">附件:</div><div class="attachment_r"><p>\n'
'\n'
'</p></div></div>'}
2020-09-15 11:21:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200909_1237841.html> (referer: https://www.ndrc.gov.cn/xxgk/zcfb/tz/index.html)
2020-09-15 11:21:54 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200909_1237841.html>
{'biaoti': '关于印发《推动物流业制造业深度融合创新发展实施方案》的通知(发改经贸〔2020〕1315号)',
'laiyuan': '发改委',
'lianjie': 'https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200909_1237841.html',
'shijian': '2020-09-09',
'wenjian': [{'file_name': '《推动物流业制造业深度融合创新发展实施方案》',
'file_url': 'https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/P020200909333031287206.pdf',
'new_file': '/2020/09/QtppfyVD_P020200909333031287206.pdf'}],
'xiangqing': '<div class="article_con article_con_notitle">\n'
' <div class="TRS_Editor"><div style="text-align: '
'center;"><span style="font-family: SimSun; font-size: '
'24px;"><strong>关于印发《推动物流业制造业</strong></span></div><span '
'style="font-family: SimSun; font-size: 16px;"><div '
'style="text-align: center;"><strong><span '
'style="font-size:24px;">深度融合创新发展实施方案》的通知</span></strong></div></span><div '
'style="text-align: center;">\xa0</div><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;">发改经贸〔2020〕1315号</div></span><div style="text-align: '
'center;">\xa0</div><span style="font-family: SimSun; font-size: '
'16px;">各省、自治区、直辖市及计划单列市、新疆生产建设兵团发展改革委、工业和信息化主管部门、公安厅、财政厅、自然资源主管部门、交通运输厅(局、委)、农业农村(农牧)厅(局、委)、商务厅(局、委)、市场监管局(厅、委)、银保监局,各地区铁路监督管理局,民航各地区管理局,邮政管理局,各铁路局集团公司:</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'为贯彻落实党中央、国务院关于推动高质量发展的决策部署,做好“六稳”工作,落实“六保”任务,进一步推动物流业制造业深度融合、创新发展,推进物流降本增效,促进制造业转型升级,国家发展改革委会同工业和信息化部等部门和单位研究制定了《推动物流业制造业深度融合创新发展实施方案》,现印发给你们,请认真贯彻执行。</span><br><br><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">国家发</span><span style="font-family: SimSun; '
'font-size: 16px;">展改革委</span></div><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;"><span style="font-family: SimSun; font-size: '
'16px;">工业和信息化部</span></div></span><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;"><span style="font-family: SimSun; font-size: '
'16px;">公\u2003\u2003安\u2003\u2003部</span></div></span><span '
'style="font-family: SimSun; font-size: 16px;"><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">财\u2003\u2003政\u2003\u2003'
'部</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 16px;">自 然 资 源 '
'部</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 16px;">交 通 运 输 '
'部</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 16px;">农 业 农 村 '
'部</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 16px;">商\u2003\u2003'
'务\u2003\u2003部</span></div></span><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;"><span style="font-family: SimSun; font-size: '
'16px;">市场监管总局</span></div></span><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;"><span style="font-family: SimSun; font-size: '
'16px;">银\u2003保\u2003监\u2003会</span></div></span><span '
'style="font-family: SimSun; font-size: 16px;"><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">国 家 铁 路 局</span></div></span><span '
'style="font-family: SimSun; font-size: 16px;"><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">民\u2003\u2003航\u2003\u2003'
'局</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 16px;">国 家 邮 政 '
'局</span></div></span><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: '
'16px;">中国国家铁路集团有限公司</span></div></span><span '
'style="font-family: SimSun; font-size: 16px;"><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">2020年8月22日</span></div></span><div '
'style="white-space: nowrap;">\xa0</div></div>\n'
' </div><div class="attachment"><div '
'class="attachment_l">附件:</div><div class="attachment_r"><p>\n'
'<a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/QtppfyVD_P020200909333031287206.pdf">《推动物流业制造业深度融合创新发展实施方案》</a>\n'
'</p></div></div>'}
2020-09-15 11:21:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200904_1237640.html> (referer: https://www.ndrc.gov.cn/xxgk/zcfb/tz/index.html)
2020-09-15 11:21:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200904_1237640.html>
{'biaoti': '关于促进航空货运设施发展的意见(发改基础〔2020〕1319号)',
'laiyuan': '发改委',
'lianjie': 'https://www.ndrc.gov.cn/xxgk/zcfb/tz/202009/t20200904_1237640.html',
'shijian': '2020-09-04',
'xiangqing': '<div class="article_con article_con_notitle">\n'
' <div style="text-align: center;"><span '
'style="font-family: SimSun; font-size: 24px;"><strong>国家发展改革委 '
'民航局</strong></span></div><span style="font-family: SimSun; '
'font-size: 16px;"><div style="text-align: '
'center;"><strong><span '
'style="font-size:24px;">关于促进航空货运设施发展的意见</span></strong></div></span><div '
'style="text-align: center;"><br></div><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;">发改基础〔2020〕1319号</div></span><br><span '
'style="font-family: SimSun; font-size: '
'16px;">各省、自治区、直辖市及计划单列市、新疆生产建设兵团发展改革委,民航各地区管理局:</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'为深入贯彻落实国务院常务会议精神,稳定产业链和供应链,针对新冠肺炎疫情防控中暴露出我国航空货运体系存在的问题,加快补齐航空货运短板和弱项,促进我国航空货运设施发展,现提出以下意见:</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>一、总体要求</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(一)重要意义。</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'航空货运是国家重要的战略性资源,具有承运货物附加值高、快捷高效等特点,在应急处突、抢险救灾、军事保障等方面具有重要作用。随着我国经济由高速增长阶段转向高质量发展阶段,电子商务和快递物流业持续快速增长,航空快件比例上升,企业经营模式由货物运输为主向全产业链延伸,传统航空货运企业逐步向提供全流程服务的航空物流企业转变,新兴的航空物流企业不断涌现,迅速成长。航空货运的专业化、物流化发展趋势,对航空货运设施的布局、运行环境和效率提出了更高要求。促进航空货运设施发展,对集聚和优化航空要素资源配置、提升航空货运企业国际竞争力、促进民航业和物流业持续健康发展具有重要意义,是深化航空业供给侧结构性改革的必然选择,是航空业转型升级实现高质量发展的重要途径。各有关方面要充分认识促进航空货运设施发展的重要意义,适应发展形势,加强研究论证,促进我国航空货运和物流业有序健康发展。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(二)指导思想。</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'以习近平新时代中国特色社会主义思想为指导,全面贯彻党的十九大和十九届二中、三中、四中全会精神,坚持以人民为中心,坚持新发展理念。以深化供给侧结构性改革为主线,以货运市场需求为导向,聚焦航空货运设施发展短板和弱项,提高国际航空货运能力,畅通供应链、稳定产业链。坚持统筹兼顾、多措并举,优化资源配置,强化要素保障,充分利用既有机场的货运设施能力,科学有序推进专业性货运枢纽机场布局建设。统筹民航与铁路、公路、水运等多种交通运输方式的有效衔接和一体化协同发展,着力提升航空货运设施专业化运营能力和服务质量,逐步构建功能完善、布局合理、衔接顺畅的航空货运设施布局和通达全球的航空货运网络体系。大力培育航空货运企业,支持航空公司扩大货运机队规模,更好服务我国经济社会发展和人民美好生活对现代化航空物流的需要。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(三)基本原则。</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'<strong>市场主导、政府引导。</strong>始终坚持市场导向,充分发挥市场配置资源的决定性作用和更好发挥政府作用,坚持市场主导、企业主体、政府支持的理念,有力有序促进航空货运设施发展。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>融合发展、积极创新。</strong>充分整合各种资源,加强航空与物流新业态、新模式的深度融合,提升航空货运供给质量,适应我国物流供应链发展需求,逐步形成航空货运网络系统,培育航空运输的新增长点,形成新动能。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>客货并举、协同发展。</strong>转变“重客轻货”观念,培育专业化航空物流企业,提升货物运输专业化水平,推进机场客货并举、协同发展,打造具有国际竞争力的航空货运枢纽。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>盘活存量、优化增量。</strong>对已具备航空货运优势的机场,进一步优化完善货运设施布局和运行环境,提升效率、效益和竞争力。鼓励有条件的既有支线机场强化和提升货运功能,稳妥有序推进专业性货运枢纽机场建设。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(四)主要目标。</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'近期2025年,建成湖北鄂州专业性货运枢纽机场,优化完善北京、上海、广州、深圳等综合性枢纽机场货运设施,充分挖掘既有综合性机场的货运设施能力,结合空港型国家物流枢纽建设,研究提出由综合性枢纽机场和专业性货运枢纽机场共同组成的航空货运枢纽规划布局。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'展望2035年,在全国范围内建成1-2个专业性货运枢纽机场,并结合《全国民用运输机场布局规划》修订,进一步完善国际航空货运枢纽布局,综合性枢纽机场和专业性货运枢纽机场布局相辅相成、更加成熟。培育若干具备国际竞争力的大型航空物流企业,覆盖全球的航空货运网络骨架初步形成,航空货运发展核心要素资源配置进一步优化,设施布局进一步完善,效益显著提高,综合保障能力大幅提升,成为服务国家重大战略、促进经济结构转型升级、深度参与国际合作、推动我国经济高质量发展的有力支撑。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>二、完善提升综合性机场货运设施能力和服务品质</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(一)提高综合性机场现有货运设施能力和利用率。</strong>统筹机场客货运区域的规划、建设、运营和管理。既有机场应合理布局货运设施,充分利用既有货运资源,可通过货运设施改扩建、扩大货运区域进深尺度、合理区分全货机和客机腹舱带货区、顺畅货运车辆进出通道等措施,提升机场内货物运输的便捷性以及快速运输的高效性。新建机场应集中布局货机站坪、货运库等货运设施,优化机坪与货运设施距离和货运流线,确保场内货运组织便捷通畅。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(二)优化机场货物运输组织。</strong>加强货运枢纽机场之间的货运航线联系,鼓励结合实际需求开展空空中转等业务,充分利用客机腹舱、卡车航班、货运班列等资源,编织多层次的航空货物运输网络。建设便捷高效的机场集疏运系统,建立综合运输管理协调机制,优化运输组织实施方案,搭建综合交通信息共享及发布平台,实现货物便捷中转和快速集散。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(三)提升机场货运服务品质。</strong>完善前端收运核查,积极推动运单电子化,强化地面服务科技集成和信息化应用,优化简化货运安检流程,完善机场口岸联检设施,加强专用设备配套,不断提高通关效率。积极引入国内外航空货运处理专业力量,按照快捷高效、准时可控的要求简化货运流程,打造机场高质量货运服务体系。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(四)强化机场内外设施的协同联动。</strong>鼓励在航空业务规模较大或具备条件的机场周边规划设立临空经济区,集聚发展临空产业,符合要求的推动建设临空经济示范区、综合保税区,缩短货运设施与临空经济区和综合保税区的间距,推进机场与临空经济区、综合保税区规划建设和设施运行的高水平联动,实现区港一体化运营,提高通关效率,降低物流成本,充分发挥航空物流业引擎作用,加快形成航空物流与临空经济区之间相互促进、相互提升的共同发展态势。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>三、稳妥有序推进专业性货运枢纽机场建设</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(一)充分遵循航空货运发展规律。</strong>借鉴国际航空货运枢纽发展成功经验,结合我国航空货运发展基础和实际,多方面深化发展认知,先试点,再总结,后推广,不盲目铺摊子,不贪大求全。总结鄂州货运枢纽建设和运营经验,“十四五”期间研究提出专业性货运枢纽机场规划布局,结合市场需求稳妥有序推进建设。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(二)符合民用运输机场布局规划。</strong>支持将有条件的既有支线机场打造为专业性货运枢纽机场。新建专业性货运枢纽机场,应从《全国民用运输机场布局规划》中选取,并按照国家和民航行业关于民用运输机场建设的相关规定,履行民用运输机场基本建设程序。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(三)引入专业化航空运输企业。</strong>专业性货运枢纽机场可引入拥有全货机机队的专业化航空物流企业,由其出资建设机场专业化货运设施,并作为主基地航空公司运营。航空物流企业可与地方政府或机场签订合资合作文件,共同制定中长期运营发展规划。驻场运行全货机数量原则上大于20架。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(四)具备良好的发展条件。</strong>专业性货运枢纽机场周边空域条件良好,土地发展空间充足,满足长远发展需要。机场所在地区位优势突出,有利于构建中枢辐射式航线网络体系,且产业基础良好。综合交通运输体系较为完善,便于组织多式联运,实现货物快速集散。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>四、全面提升航空货运设施使用效能</strong></span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(一)完善飞机引进政策。</strong>优化机队结构,适应航空货运企业的发展需求,采取更加灵活的全货机引进政策,鼓励通过融资租赁、购买以及湿租等方式增加货机,支持货运航空公司壮大机队规模,发展全货机运输。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(二)持续改善空域条件。</strong>推进国家空域管理体制改革,扩大空域资源供给,推进空域灵活高效使用,依据区域特色优势和资源禀赋,结合机场功能定位,持续优化航空货运航线和时刻资源配置,加大航权开放力度,简化货运航班审批程序,为航空货运发展释放更大空间。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(三)培育航空货运企业。</strong>鼓励航空货运企业与快递物流企业打破所有制限制,加快培育具有国际竞争力的大型快递物流企业,以大型快递物流企业为主体,以其组建的货运航空公司为主基地航空公司,参与主导专业性货运枢纽机场规划、建设和运营。研究优化整合国内航空公司全货机机队资源和市场资源,培育航空货运超级承运人,提升国际航空货运市场竞争力。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(四)提升机场管理水平。</strong>建设航空物流公共信息平台,支持加快开展航空电子货运试点,研究构建“单一窗口”空港通关系统,提升航空货运信息化、标准化水平,加快民航与铁路、公路等物流标准对接,推动航空物流操作标准、信息标准、运行标准和设备标准的建设工作。加强大数据、云计算、人工智能、区块链等新技术在机场货运中的综合运用,鼓励科技创新、业务创新、管理创新,努力实现资源优化配置和精细化、智能化管控,共同打造专业化、现代化的航空物流体系。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>五、保障措施</strong></span><br><span style="font-family: '
'SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(一)加强主体责任落实。</strong>发展改革部门要加强对设施布局、综合交通方式的统筹协调,做好规划政策整合。民航行业管理部门要加强对航空货运设施建设发展的行业管理和指导,统筹把握发展进程和行业标准制定,减少无序竞争和重复建设。积极发挥民航行业协会和社会团体的管理、服务和协调作用,有力支持航空货运设施持续健康发展。切实强化航空运输企业和机场公司在航空货运发展方面的协作合作,实现共商共建共享共赢。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(二)做好资源要素保障。</strong>坚持机场的公共基础设施属性,鼓励地方政府加大对机场货运及其重要配套设施的规划选址、土地使用、建设运营等要素支持,严格规划用地预留和控制,统筹综合交通基础设施布局,落实《国家发展改革委关于促进枢纽机场联通轨道交通的意见》,同步规划临空物流园区等配套设施建设。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(三)创新设施融资模式。</strong>鼓励借鉴国际经验,由地方政府、机场公司、航空物流企业以及社会资本多方合作,采取BOT、BOO、BOOT等多种模式开发建设和管理货运设施,由航空物流企业出资建设适合自身运营发展需要的转运中心、航空货站、仓储设施等,实施专业化运营。</span><br><span '
'style="font-family: SimSun; font-size: 16px;">\u2003\u2003'
'<strong>(四)强化专业人才培养。</strong>加大民航管理技术人才、航空物流专业人才培养以及科技研发投入支持力度,鼓励科研机构建立航空物流研究方向和航空物流领域行业智库,充分发挥专业智库在航空货运发展中的政策咨询和技术支持作用,提升航空物流企业和机场货运设施运营管理能力和水平。</span><br><br><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">国家发展</span><span style="font-family: SimSun; '
'font-size: 16px;">改革委</span></div><span style="font-family: '
'SimSun; font-size: 16px;"><div style="text-align: '
'center;"><span style="font-family: SimSun; font-size: '
'16px;">民\u2003\u2003航\u2003\u2003局</span></div></span><span '
'style="font-family: SimSun; font-size: 16px;"><div '
'style="text-align: center;"><span style="font-family: SimSun; '
'font-size: 16px;">2020年8月24日</span></div></span><div '
'style="white-space: nowrap;"><br></div>\n'
' </div><div class="attachment"><div '
'class="attachment_l">附件:</div><div class="attachment_r"><p>\n'
'\n'
'</p></div></div>'}
2020-09-15 11:21:55 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:21:55 [root] INFO: 爬虫运行完毕了
2020-09-15 11:21:55 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1518,
'downloader/request_count': 5,
'downloader/request_method_count/GET': 5,
'downloader/response_bytes': 34621,
'downloader/response_count': 5,
'downloader/response_status_count/200': 5,
'elapsed_time_seconds': 5.371551,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 21, 55, 791302),
'item_scraped_count': 4,
'log_count/DEBUG': 9,
'log_count/INFO': 32,
'request_depth_max': 1,
'response_received_count': 5,
'scheduler/dequeued': 5,
'scheduler/dequeued/memory': 5,
'scheduler/enqueued': 5,
'scheduler/enqueued/memory': 5,
'start_time': datetime.datetime(2020, 9, 15, 3, 21, 50, 419751)}
2020-09-15 11:21:55 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 3a984af3a603947f
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6028
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/wenhuahelvyoubu_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

208
demo1/logs/fazhancujinju_2020_9.log

@@ -0,0 +1,208 @@
2020-09-15 10:49:28 [scrapy.extensions.telnet] INFO: Telnet Password: d9426dcc597a68e4
2020-09-15 10:49:28 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 10:49:29 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 10:49:29 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 10:49:29 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 10:49:29 [scrapy.core.engine] INFO: Spider opened
2020-09-15 10:49:29 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 10:49:29 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 10:49:29 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://xqyj.shanxi.gov.cn/v2/html/tzgg/index.html> (referer: None)
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200128/8882.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200914/10731.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200911/10719.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200907/10684.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200902/10659.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200901/10647.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200831/10634.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200828/10617.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200826/10606.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200825/10593.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200824/10581.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200819/10540.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200817/10529.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200817/10524.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200814/10513.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200813/10498.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200812/10491.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200812/10490.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200812/10492.html
2020-09-15 10:49:29 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200810/10474.html
2020-09-15 10:49:29 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 10:49:29 [root] INFO: 爬虫运行完毕了
2020-09-15 10:49:29 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 245,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 21572,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.559779,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 2, 49, 29, 679066),
'log_count/DEBUG': 1,
'log_count/INFO': 31,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 2, 49, 29, 119287)}
2020-09-15 10:49:29 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-15 10:51:05 [scrapy.extensions.telnet] INFO: Telnet Password: 6319e42fc51397f0
2020-09-15 10:51:05 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 10:51:05 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 10:51:05 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 10:51:05 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 10:51:05 [scrapy.core.engine] INFO: Spider opened
2020-09-15 10:51:05 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 10:51:05 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 10:51:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://xqyj.shanxi.gov.cn/v2/html/tzgg/index.html> (referer: None)
2020-09-15 10:51:10 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200128/8882.html
2020-09-15 10:52:41 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200914/10731.html
2020-09-15 10:53:03 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200911/10719.html
2020-09-15 11:23:14 [scrapy.extensions.telnet] INFO: Telnet Password: 908b6b91a251874b
2020-09-15 11:23:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:23:14 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:23:14 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:23:14 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:23:14 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:23:14 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:23:14 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:23:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://xqyj.shanxi.gov.cn/v2/html/tzgg/index.html> (referer: None)
2020-09-15 11:23:14 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200128/8882.html
2020-09-15 11:23:14 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200914/10731.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200911/10719.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200907/10684.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200902/10659.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200901/10647.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200831/10634.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200828/10617.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200826/10606.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200825/10593.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200824/10581.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200819/10540.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200817/10529.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200817/10524.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200814/10513.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200813/10498.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200812/10491.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200812/10490.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200812/10492.html
2020-09-15 11:23:15 [root] INFO: 这个链接已经爬过了-----:http://xqyj.shanxi.gov.cn/v2/html/tzgg/20200810/10474.html
2020-09-15 11:23:15 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:23:15 [root] INFO: 爬虫运行完毕了
2020-09-15 11:23:15 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 245,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 21572,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.65274,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 23, 15, 347368),
'log_count/DEBUG': 1,
'log_count/INFO': 31,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 23, 14, 694628)}
2020-09-15 11:23:15 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 66d535e49cbb95b0
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6031
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/sxkejiting_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

1549
demo1/logs/gongyehexinxihuabu_2020_9.log

File diff suppressed because it is too large

560
demo1/logs/huojuzhongxin_2020_9.log

@@ -0,0 +1,560 @@
2020-09-15 11:19:59 [scrapy.extensions.telnet] INFO: Telnet Password: de017f73eeeebee1
2020-09-15 11:19:59 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:19:59 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:19:59 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:19:59 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:19:59 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:19:59 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:19:59 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6024
2020-09-15 11:19:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page1.htm> (referer: None)
2020-09-15 11:19:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.chinatorch.gov.cn/kjb/tzgg/list.shtml> (referer: None)
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78356.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78152.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202008/31968fc6f34141f7b00ca1a4a9403b9b.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/62c204b65c5743e3bfa42b11f7f74e07.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/4d0a9a00609849ce8f7f92f8deefc0f1.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/72fc74e9f18e41a09e793db6ad1e57d5.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/ca4e198b01dc4d9ca3b7280193db6e43.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/5770a68db55b4122ac26d3b320fcd47d.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/687f7fe1fb554ffca065405d3ad58ff9.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/a1aa35a2ca6342129f91751a67d301eb.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/98f4eba5be7c44de86dcaeb5c52685a1.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/5999c535089542f2aba8661cd76f7f51.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/ebb7f4834b96465d92adbe7f9b59172e.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/9b5772a6b34946038857e345a74293bd.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/5d569f0bd065466ea650f8f3181f0f28.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/38bcefd694bc4aadbd61b4cd503c1a66.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202005/3b861ac8ca754865a9ad8cec9c91a461.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202005/3dd5ac8e23424ec9a2efdb013c07f1e1.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202005/0fbf02c759504c01b5b7b1485081bc52.shtml
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78522.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78519.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78513.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78512.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78478.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78477.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78463.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78460.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78459.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78457.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78444.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78437.htm
2020-09-15 11:19:59 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78435.htm
2020-09-15 11:20:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.chinatorch.gov.cn/kjb/tzgg/202009/333a6315e6e847598015470c023e6756.shtml> (referer: http://www.chinatorch.gov.cn/kjb/tzgg/list.shtml)
2020-09-15 11:20:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.nsfc.gov.cn/publish/portal0/tab442/info78573.htm> (referer: http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page1.htm)
2020-09-15 11:20:00 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.chinatorch.gov.cn/kjb/tzgg/202009/333a6315e6e847598015470c023e6756.shtml>
{'biaoti': '关于举办2020年国家高新区主任培训班的通知',
'laiyuan': '科技部火炬中心',
'lianjie': 'http://www.chinatorch.gov.cn/kjb/tzgg/202009/333a6315e6e847598015470c023e6756.shtml',
'shijian': '2020-09-03',
'wenjian': [{'file_name': '附件:2020年度国家高新区主任培训班报名表',
'file_url': 'http://www.chinatorch.gov.cn333a6315e6e847598015470c023e6756/files/80e9462717b54484acb6ce68eaf9a001.docx',
'new_file': '/2020/09/HJDi7NGg_80e9462717b54484acb6ce68eaf9a001.docx'},
{'file_name': '_',
'file_url': 'http://www.chinatorch.gov.cn/default/images/icon16/doc.gif',
'new_file': '/2020/09/lw6Z6JGe_doc.gif'},
{'file_name': '_',
'file_url': 'http://www.chinatorch.gov.cn333a6315e6e847598015470c023e6756/images/e2b42f4c281042b5a536d56c1b40e60b.png',
'new_file': '/2020/09/0Enc5MRi_e2b42f4c281042b5a536d56c1b40e60b.png'}],
'xiangqing': '<div class="pages_content" id="content">\r\n'
'\t\t\t<div>\r\n'
' \t<p style="text-align: center; line-height: 1.5; '
'text-indent: 2em; font-family: 宋体; font-size: '
'12pt;">国科火字〔2020〕155号</p>\r\n'
'<p style="text-align: center; line-height: 1.5; text-indent: '
'2em; font-family: 宋体; font-size: 12pt;"><br>\r\n'
'</p>\r\n'
'<p style="text-align: center; line-height: 1.5; text-indent: '
'2em; font-family: 宋体; font-size: '
'12pt;"><strong>关于举办2020年国家高新区主任培训班的通知</strong></p>\r\n'
'<p style="text-align: center; line-height: 1.5; text-indent: '
'2em; font-family: 宋体; font-size: 12pt;"><br>\r\n'
'</p>\r\n'
'<p style="line-height: 1.5; font-family: 宋体; font-size: '
'12pt;">各国家高新区管委会:</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: '
'12pt;">日前,国务院发布了《国务院关于促进国家高新技术产业开发区高质量发展的若干意见》(国发〔2020〕7号),明确了新时代国家高新区的定位和目标,对国家高新区的下一步发展进行了全面部署。根据科技部2020年培训工作安排,科技部火炬中心将举办“2020年国家高新区主任培训班”。现将有关事项通知如下:</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">一、培训内容</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: '
'12pt;">本期培训班以深入学习贯彻《国务院关于促进国家高新技术产业开发区高质量发展的若干意见》(国发〔2020〕7号)为主题,邀请从事高新区管理的领导及有关专家学者,通过专题讲座、案例教学、交流研讨等形式进行培训。主要内容包括:学习和贯彻《国务院关于促进国家高新技术产业开发区高质量发展的若干意见》具体要求,学习和贯彻科技部党组要求,学习和了解国家高新区面临的国际和国内新形势、新问题,学习和研讨高新区在提升科技创新能力、产业发展、绿色发展、改革创新等方面的实务和案例。</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">二、培训对象</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: '
'12pt;">各国家高新区管委会主要负责同志,每单位1人,共169人。(新冠中、高风险区的国家高新区可视具体情况确定报名参加)</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">三、培训时间和地点</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">培训时间:9月21日-24日,9月20日全天报到。</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">培训地点:江苏省苏州市苏州工业园区启月街299号(苏州独墅湖会议中心)。</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">四、培训费用</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">学员培训期间的食宿由培训班统一安排,不收取费用。</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">五、报名方式</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: '
'12pt;">请各国家高新区于2020年9月11日前,将“2020年度国家高新区主任培训班报名表”(详见附件)以电子邮件的方式分别发至科技部火炬中心及苏州工业园。</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">六、联系方式</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">1.苏州工业园</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">联系人:顾凡、申晨曦、张君达、程斌</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">电话:0512-66681633;0512-66681635;</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">13862166455;15850594613</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">传真:0512-66681699</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">邮箱:training@sipac.gov.cn</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">2.科技部火炬中心</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">联系人:魏颖、庞林花</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: '
'12pt;">电话:010-88656193;010-88656199;010-88656175</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">传真:010-88656190</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">邮箱:<a '
'href="mailto:gaoxq2@ctp.gov.cn">gaoxq2@ctp.gov.cn</a></p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;"><br>\r\n'
'</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;"><img border="0" '
'src="https://www.sxwikionline.com/staticrec/policy/2020/09/lw6Z6JGe_doc.gif"><a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/HJDi7NGg_80e9462717b54484acb6ce68eaf9a001.docx" '
'target="_blank" type="file">附件:2020年度国家高新区主任培训班报名表</a><br>\r\n'
'</p>\r\n'
'<p style="text-align: right; line-height: 1.5; text-indent: '
'2em; font-family: 宋体; font-size: 12pt;"><br>\r\n'
'</p>\r\n'
'<p style="text-align: right; line-height: 1.5; text-indent: '
'2em; font-family: 宋体; font-size: 12pt;">科技部火炬中心</p>\r\n'
'<p style="text-align: right; line-height: 1.5; text-indent: '
'2em; font-family: 宋体; font-size: 12pt;">2020年9月1日</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;">(此件主动公开)</p>\r\n'
'<p style="line-height: 1.5; text-indent: 2em; font-family: 宋体; '
'font-size: 12pt;"><br>\r\n'
'</p>\r\n'
'<p style="text-align: center; line-height: 1.5; text-indent: '
'2em; font-family: 宋体; font-size: 12pt;"><img border="0" '
'src="https://www.sxwikionline.com/staticrec/policy/2020/09/0Enc5MRi_e2b42f4c281042b5a536d56c1b40e60b.png"></p>\r\n'
'\t\t\t</div>\r\n'
' </div>'}
2020-09-15 11:20:00 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:20:00 [root] INFO: 爬虫运行完毕了
2020-09-15 11:20:00 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 582,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 23109,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 1.548075,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 20, 0, 703358),
'item_scraped_count': 1,
'log_count/DEBUG': 5,
'log_count/INFO': 43,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2020, 9, 15, 3, 19, 59, 155283)}
2020-09-15 11:20:00 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-15 11:20:00 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.nsfc.gov.cn/publish/portal0/tab442/info78573.htm>
{'biaoti': '关于发布碳基能源转化利用的催化科学重大研究计划2020年度项目指南的通告',
'laiyuan': '国家自然科学基金委员会',
'lianjie': 'http://www.nsfc.gov.cn/publish/portal0/tab442/info78573.htm',
'shijian': '2020-09-09',
'xiangqing': '<div class="content_xilan">\n'
'<table cellspacing="0" cellpadding="0" border="0" '
'width="100%">\n'
' <tbody>\n'
' <tr>\n'
' <td style="height: 10px;"> </td>\n'
' </tr>\n'
' <tr>\n'
' <td align="center">\n'
'<font face="����"></font>\n'
'</td>\n'
' </tr>\n'
' </tbody>\n'
'</table>\n'
'<br>\n'
'<span class="normal105" id="zoom"><!--ContentStart--><p '
'style="text-align: center;">国科金发计〔2020〕61号</p>\n'
'<p>\xa0</p>\n'
'<p>国家自然科学基金委员会现发布“碳基能源转化利用的催化科学”重大研究计划2020年度项目指南,请申请人及依托单位按项目指南中所述的要求和注意事项申请。</p>\n'
'<p>\xa0</p>\n'
'<p><a href="/publish/portal0/tab568/info78572.htm" '
'style="text-decoration:underline">附件:“碳基能源转化利用的催化科学”重大研究计划2020年度项目指南</a></p>\n'
'<p>\xa0</p>\n'
'<p style="text-align: right;">国家自然科学基金委员会</p>\n'
'<p style="text-align: '
'right;">2020年9月8日</p><!--ContentEnd--></span></div>'}
2020-09-15 11:20:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.nsfc.gov.cn/publish/portal0/tab442/info78571.htm> (referer: http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page1.htm)
2020-09-15 11:20:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.nsfc.gov.cn/publish/portal0/tab442/info78571.htm>
{'biaoti': '关于发布多相反应过程中的介尺度机制及调控重大研究计划2020年度项目指南的通告',
'laiyuan': '国家自然科学基金委员会',
'lianjie': 'http://www.nsfc.gov.cn/publish/portal0/tab442/info78571.htm',
'shijian': '2020-09-09',
'xiangqing': '<div class="content_xilan">\n'
'<table cellspacing="0" cellpadding="0" border="0" '
'width="100%">\n'
' <tbody>\n'
' <tr>\n'
' <td style="height: 10px;"> </td>\n'
' </tr>\n'
' <tr>\n'
' <td align="center">\n'
'<font face="����"></font>\n'
'</td>\n'
' </tr>\n'
' </tbody>\n'
'</table>\n'
'<br>\n'
'<span class="normal105" id="zoom"><!--ContentStart--><p '
'style="text-align: center;">国科金发计〔2020〕60号</p>\n'
'<p style="text-align: center;">\xa0</p>\n'
'<p>国家自然科学基金委员会现发布“多相反应过程中的介尺度机制及调控”重大研究计划2020年度项目指南,请申请人及依托单位按项目指南中所述的要求和注意事项申请。</p>\n'
'<p>\xa0</p>\n'
'<p><a href="/publish/portal0/tab568/info78570.htm" '
'style="text-decoration:underline">附件:“多相反应过程中的介尺度机制及调控”重大研究计划2020年度项目指南</a> '
'</p>\n'
'<p>\xa0</p>\n'
'<p style="text-align: right;">国家自然科学基金委员会</p>\n'
'<p style="text-align: '
'right;">2020年9月8日</p><!--ContentEnd--></span></div>'}
2020-09-15 11:20:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.nsfc.gov.cn/publish/portal0/tab442/info78569.htm> (referer: http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page1.htm)
2020-09-15 11:20:03 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.nsfc.gov.cn/publish/portal0/tab442/info78569.htm>
{'biaoti': '关于发布团簇构造、功能及多级演化重大研究计划2020年度项目指南的通告',
'laiyuan': '国家自然科学基金委员会',
'lianjie': 'http://www.nsfc.gov.cn/publish/portal0/tab442/info78569.htm',
'shijian': '2020-09-09',
'xiangqing': '<div class="content_xilan">\n'
'<table cellspacing="0" cellpadding="0" border="0" '
'width="100%">\n'
' <tbody>\n'
' <tr>\n'
' <td style="height: 10px;"> </td>\n'
' </tr>\n'
' <tr>\n'
' <td align="center">\n'
'<font face="����"></font>\n'
'</td>\n'
' </tr>\n'
' </tbody>\n'
'</table>\n'
'<br>\n'
'<span class="normal105" id="zoom"><!--ContentStart--><p '
'style="text-align: center;">国科金发计&gt;〔2020〕59号</p>\n'
'<p>\xa0</p>\n'
'<p>国家自然科学基金委员会现发布“团簇构造、功能及多级演化”重大研究计划2020年度项目指南,请申请人及依托单位按项目指南中所述的要求和注意事项申请。</p>\n'
'<p>\xa0</p>\n'
'<p><a href="/publish/portal0/tab568/info78568.htm" '
'style="text-decoration:underline">附件:“团簇构造、功能及多级演化”重大研究计划2020年度项目指南</a></p>\n'
'<p>\xa0</p>\n'
'<p style="text-align: right;">国家自然科学基金委员会</p>\n'
'<p style="text-align: '
'right;">2020年9月8日</p><!--ContentEnd--></span></div>'}
2020-09-15 11:20:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.nsfc.gov.cn/publish/portal0/tab442/info78567.htm> (referer: http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page1.htm)
2020-09-15 11:20:04 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.nsfc.gov.cn/publish/portal0/tab442/info78567.htm>
{'biaoti': '关于发布大气细颗粒物的毒理与健康效应重大研究计划2020年度项目指南的通告',
'laiyuan': '国家自然科学基金委员会',
'lianjie': 'http://www.nsfc.gov.cn/publish/portal0/tab442/info78567.htm',
'shijian': '2020-09-09',
'xiangqing': '<div class="content_xilan">\n'
'<table cellspacing="0" cellpadding="0" border="0" '
'width="100%">\n'
' <tbody>\n'
' <tr>\n'
' <td style="height: 10px;"> </td>\n'
' </tr>\n'
' <tr>\n'
' <td align="center">\n'
'<font face="����"></font>\n'
'</td>\n'
' </tr>\n'
' </tbody>\n'
'</table>\n'
'<br>\n'
'<span class="normal105" id="zoom"><!--ContentStart--><p '
'style="text-align: center;">国科金发计〔2020〕58号</p>\n'
'<p>\xa0</p>\n'
'<p>国家自然科学基金委员会现发布“大气细颗粒物的毒理与健康效应”重大研究计划2020年度项目指南,请申请人及依托单位按项目指南中所述的要求和注意事项申请。</p>\n'
'<p>\xa0</p>\n'
'<p><a href="/publish/portal0/tab568/info78566.htm" '
'style="text-decoration:underline">附件:“大气细颗粒物的毒理与健康效应”重大研究计划2020年度项目指南</a></p>\n'
'<p>\xa0</p>\n'
'<p style="text-align: right;">国家自然科学基金委员会</p>\n'
'<p style="text-align: '
'right;">2020年9月8日</p><!--ContentEnd--></span></div>'}
2020-09-15 11:20:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.nsfc.gov.cn/publish/portal0/tab442/info78537.htm> (referer: http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page1.htm)
2020-09-15 11:20:05 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.nsfc.gov.cn/publish/portal0/tab442/info78537.htm>
{'biaoti': '2020年度国家自然科学基金委员会与芬兰科学院合作交流项目指南',
'laiyuan': '国家自然科学基金委员会',
'lianjie': 'http://www.nsfc.gov.cn/publish/portal0/tab442/info78537.htm',
'shijian': '2020-09-04',
'wenjian': [{'file_name': '1.合作交流计划书撰写说明',
'file_url': 'http://www.nsfc.gov.cn/Portals/0/fj/fj20200904_01.docx',
'new_file': '/2020/09/WQ12JV8o_fj20200904_01.docx'},
{'file_name': '2.双边研讨会计划书',
'file_url': 'http://www.nsfc.gov.cn/Portals/0/fj/fj20200904_02.docx',
'new_file': '/2020/09/rZ1dWEN3_fj20200904_02.docx'},
{'file_name': '3.合作协议模板',
'file_url': 'http://www.nsfc.gov.cn/Portals/0/fj/fj20200904_03.docx',
'new_file': '/2020/09/RF5gdArQ_fj20200904_03.docx'}],
'xiangqing': '<div class="content_xilan">\n'
'<table cellspacing="0" cellpadding="0" border="0" '
'width="100%">\n'
' <tbody>\n'
' <tr>\n'
' <td style="height: 10px;"> </td>\n'
' </tr>\n'
' <tr>\n'
' <td align="center">\n'
'<font face="����"></font>\n'
'</td>\n'
' </tr>\n'
' </tbody>\n'
'</table>\n'
'<br>\n'
'<span class="normal105" '
'id="zoom"><!--ContentStart--><p>根据国家自然科学基金委员会(NSFC)与芬兰科学院(AF)的合作协议及双边工作计划,双方于2020年共同资助中国与芬兰科研人员在科学研究基础上开展的合作交流和双边研讨会项目。</p>\n'
'<p><b>一、</b><b>项目说明</b><b></b></p>\n'
'<p>(一)资助领域。</p>\n'
'<p>无领域限制。</p>\n'
'<p>(二)资助强度。</p>\n'
'<p>中方资助强度为不超过10万元/项。<b></b></p>\n'
'<p>(三)资助内容。</p>\n'
'<p>对于合作交流项目,自然科学基金委资助中方研究人员访芬的国际旅费和在芬兰访问期间的住宿费、伙食费、城市间交通费。芬兰科学院资助芬方研究人员访华的相关费用。</p>\n'
'<p>对于在中国召开的双边研讨会,自然科学基金委资助中方举办会议所需的会议费和中方参会人员住宿费、伙食费及城市间交通费。芬兰科学院资助芬方参会人员的相关费用。</p>\n'
'<p>对于在芬兰召开的双边研讨会,自然科学基金委资助中方研究人员访芬的国际旅费和在芬兰开会期间的住宿费、伙食费、城市间交通费。芬兰科学院资助芬方举办会议所需的会议费和芬方参会人员的相关费用。</p>\n'
'<p>(四)项目执行期。</p>\n'
'<p>合作交流项目执行期为2年(项目起止日期为2021年3月1日至2023年2月28日)。</p>\n'
'<p>双边研讨会项目执行期为1年(项目起止日期为2021年3月1日至2022年2月28日)。</p>\n'
'<p><b>二、</b><b>申请资格</b><b></b></p>\n'
'<p>(一)中方申请人须是2021年12月31日(含)以后结题的3年期(含)以上国家自然科学基金在研项目(合作交流项目除外)的主持人或主要参与者(在研项目的主要参与者作为中方申请人须具有高级专业技术职务职称或博士学位,或有2名与其研究领域相同、具有高级专业技术职务职称的科学技术人员推荐,并经在研项目负责人同意),并依托该在研基金项目提交申请。合作交流应密切围绕所依托在研基金项目的研究内容。</p>\n'
'<p>(二)芬方合作者应符合芬兰科学院对本国申请人的资格要求。</p>\n'
'<p>(三)中芬双方申请人须分别向国家自然科学基金委员会和芬兰科学院递交项目申请,单方申请将不予受理。芬方申请指南详见:</p>\n'
'<p>https://www.aka.fi/en/funding/apply-for-funding/for-researchers/call-for-funding-for-international-researcher-mobility-based-on-bilateral-agreements/。</p>\n'
'<p>(四)更多关于申请资格的说明,请见《2020年度国家自然科学基金项目指南》。</p>\n'
'<p><b>三、限项规定</b><b></b></p>\n'
'<p>(一)本项目属于国际(地区)合作交流项目,不受“高级专业技术职务(职称)人员申请和正在承担的项目总数限为2项”规定的限制。</p>\n'
'<p>(二)作为申请人申请和作为项目负责人正在承担的NSFC-AF(中芬)合作交流和双边研讨会项目,合计限1项。</p>\n'
'<p>(三)更多关于限项规定的说明,请见《2020年度国家自然科学基金项目指南》。</p>\n'
'<p><b>四、申报要求</b><b></b></p>\n'
'<p>(一)在线填报路径。<b></b></p>\n'
'<p>中方申请人须登录ISIS科学基金网络系统(https://isisn.nsfc.gov.cn/egrantweb/),在线填报《国家自然科学基金国际(地区)合作交流项目申请书》。具体步骤如下:</p>\n'
'<p>1. '
'选择“项目负责人”用户组登录系统,进入后点击“在线申请”进入申请界面;点击“新增项目申请”按钮进入项目类别选择界面。</p>\n'
'<p>2. 点击“国际(地区)合作与交流项目”左侧“+”号或者右侧“展开”按钮,展开下拉菜单。</p>\n'
'<p>3. '
'对于合作交流项目,点击“合作交流(组织间协议项目)”右侧的“填写申请”按钮,进入选择“合作协议”界面,在下拉菜单中选择“NSFC-AF(芬兰)”,然后按系统要求输入依托在研基金项目的批准号(作为负责人承担的3年期及以上科学基金项目批准号),进入具体申请书填写界面。</p>\n'
'<p>对于在中国召开的双边研讨会,点击“<strong>在华召开国际(地区)学术会议</strong>”右侧的“填写申请”按钮,进入选择“合作协议”界面,在下拉菜单中选择“NSFC-AF(中芬)”,然后按系统要求输入依托在研基金项目的批准号(作为负责人承担的3年期及以上科学基金项目批准号),进入具体申请书填写界面。</p>\n'
'<p>对于在芬兰召开的双边研讨会,点击“<strong>出国(境)参加双(多)边会议</strong>”右侧的“填写申请”按钮,进入选择“合作协议”界面,在下拉菜单中选择“NSFC-AF(中芬)”,然后按系统要求输入依托在研基金项目的批准号(作为负责人承担的3年期及以上科学基金项目批准号),进入具体申请书填写界面。</p>\n'
'<p>(二)申请书填写说明。</p>\n'
'<p>中芬双方申请书中的项目名称(英文)、双方依托单位和双方项目负责人(默认为“中方人信息”栏目和“境外合作人员”栏目的第一人)应严格一致。</p>\n'
'<p>在“项目执行计划”栏目,应按照交流年度,详细列出出访及来访人员姓名、出访及来访日期和拟开展的研究工作内容,或双边研讨会的日程安排和拟参会人员信息。</p>\n'
'<p>本项目无间接费用,中方经费填写仅限经费预算表格中的第9项“差旅/会议/国际合作与交流费”栏。在“预算说明书”栏目,应按照“项目执行计划”的内容,按交流年度为出访人员的国际旅费、住宿费、伙食费、城市间交通费,或中芬双边研讨会制定详细预算。</p>\n'
'<p>(三)在线提交附件材料。</p>\n'
'<p>除在线填写并提交中文申请书外,中方申请人须将下列材料上传至中文申请书的“附件”栏中一同提交:</p>\n'
'<p>1. '
'合作交流项目须提交与芬方合作者联合撰写的合作交流计划书(撰写说明见附件1)、芬方申请人及参与者简历、及中芬双方申请人签署的合作交流协议(协议模板见附件3)。</p>\n'
'<p>2. 双边研讨会项目须提交与芬方合作者联合撰写的双边研讨会计划书(撰写说明见附件2)。</p>\n'
'<p>(四)报送材料。</p>\n'
'<p>依托单位应对本单位申请人所提交申请材料的真实性、完整性和合规性,申报预算的目标相关性、政策相符性和经济合理性进行审核。本项目纳入无纸化申请范围,依托单位完成电子申请书及附件材料的逐项确认后,应于申请材料提交截止时间前通过ISIS科学基金网络系统上传本单位科研诚信承诺书的电子扫描件(请在ISIS科学基金网络系统中下载模板,打印填写后由法定代表人签字、依托单位加盖公章),无需提供纸质材料。ISIS系统在线申报接收期为2020年9月4日至2020年9月30日16时。</p>\n'
'<p>项目获批准后,将申请书的纸质签字盖章页装订在《资助项目计划书》最后,一并提交。签字盖章的信息应与电子申请书严格保持一致。</p>\n'
'<p><b>注:</b><b>请申请人严格遵照本项目指南的各项要求填报申请,不符合上述要求的申请将不予受理。如有疑问,请致电项目联系人。</b><b></b></p>\n'
'<p><b>五、结果公布</b><b></b></p>\n'
'<p>2021年年初将在国家自然科学基金委员会门户网站国际合作栏目中公布资助结果。</p>\n'
'<p><b>六、项目联系人</b><b></b></p>\n'
'<p>中方联系人:申 洁</p>\n'
'<p>电 话:010-6232 7017</p>\n'
'<p>Email: shenjie@nsfc.gov.cn</p>\n'
'<p>中方申请人在线填写申请书过程中如遇到技术问题,可联系自然科学基金委ISIS系统技术支持。</p>\n'
'<p>电 话:010-6231 7474</p>\n'
'<p>\xa0</p>\n'
'<p>芬方联系人:Siru Oksa</p>\n'
'<p>Email: siru.oksa@aka.fi</p>\n'
'<p>UllaEllmén</p>\n'
'<p>Email: ulla.ellmen@aka.fi</p>\n'
'<p>\xa0</p>\n'
'<p>附件:<a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/WQ12JV8o_fj20200904_01.docx" '
'target="_blank"><span style="color: #0070c0; text-decoration: '
'underline;">1.合作交流计划书撰写说明</span></a></p>\n'
'<p><a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/rZ1dWEN3_fj20200904_02.docx" '
'target="_blank"><span style="color: #0070c0; text-decoration: '
'underline;">2.双边研讨会计划书</span></a></p>\n'
'<p><a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/RF5gdArQ_fj20200904_03.docx" '
'target="_blank"><span style="color: #0070c0; text-decoration: '
'underline;">3.合作协议模板</span></a></p>\n'
'<p>\xa0</p>\n'
'<p style="text-align: right;">国家自然科学基金委员会</p>\n'
'<p style="text-align: right;">国际合作局</p>\n'
'<p style="text-align: '
'right;">2020年9月4日</p><!--ContentEnd--></span></div>'}
2020-09-15 11:20:05 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:20:05 [root] INFO: 爬虫运行完毕了
2020-09-15 11:20:05 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1922,
'downloader/request_count': 6,
'downloader/request_method_count/GET': 6,
'downloader/response_bytes': 38806,
'downloader/response_count': 6,
'downloader/response_status_count/200': 6,
'elapsed_time_seconds': 6.718878,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 20, 5, 791092),
'item_scraped_count': 5,
'log_count/DEBUG': 14,
'log_count/INFO': 56,
'request_depth_max': 1,
'response_received_count': 6,
'scheduler/dequeued': 6,
'scheduler/dequeued/memory': 6,
'scheduler/enqueued': 6,
'scheduler/enqueued/memory': 6,
'start_time': datetime.datetime(2020, 9, 15, 3, 19, 59, 72214)}
2020-09-15 11:20:05 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-15 11:20:39 [scrapy.extensions.telnet] INFO: Telnet Password: ae67de1bc4c91e21
2020-09-15 11:20:39 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:20:39 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:20:39 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:20:39 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:20:39 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:20:39 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:20:39 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:20:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.chinatorch.gov.cn/kjb/tzgg/list.shtml> (referer: None)
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202009/333a6315e6e847598015470c023e6756.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202008/31968fc6f34141f7b00ca1a4a9403b9b.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/62c204b65c5743e3bfa42b11f7f74e07.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/4d0a9a00609849ce8f7f92f8deefc0f1.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/72fc74e9f18e41a09e793db6ad1e57d5.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/ca4e198b01dc4d9ca3b7280193db6e43.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/5770a68db55b4122ac26d3b320fcd47d.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202007/687f7fe1fb554ffca065405d3ad58ff9.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/a1aa35a2ca6342129f91751a67d301eb.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/98f4eba5be7c44de86dcaeb5c52685a1.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/5999c535089542f2aba8661cd76f7f51.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/ebb7f4834b96465d92adbe7f9b59172e.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/9b5772a6b34946038857e345a74293bd.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/5d569f0bd065466ea650f8f3181f0f28.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202006/38bcefd694bc4aadbd61b4cd503c1a66.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202005/3b861ac8ca754865a9ad8cec9c91a461.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202005/3dd5ac8e23424ec9a2efdb013c07f1e1.shtml
2020-09-15 11:20:40 [root] INFO: 这个链接已经爬过了-----:http://www.chinatorch.gov.cn/kjb/tzgg/202005/0fbf02c759504c01b5b7b1485081bc52.shtml
2020-09-15 11:20:40 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:20:40 [root] INFO: 爬虫运行完毕了
2020-09-15 11:20:40 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 244,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 11236,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.543031,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 20, 40, 474551),
'log_count/DEBUG': 1,
'log_count/INFO': 29,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 20, 39, 931520)}
2020-09-15 11:20:40 [scrapy.core.engine] INFO: Spider closed (finished)
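The repeated root-logger INFO lines above, "这个链接已经爬过了-----:<url>" ("this link has already been crawled"), indicate that each spider skips detail pages whose URLs were recorded by an earlier run, and "爬虫运行完毕了" means "the crawl has finished". A minimal sketch of such a skip check follows; the helper names and the in-memory set are assumptions for illustration only, not the project's actual Util.py API (the real store is presumably persistent, e.g. a database the pipelines write to).

import logging

logging.basicConfig(level=logging.INFO)

# Hypothetical stand-in for the project's URL bookkeeping; the real logic
# lives somewhere like demo1/Util.py and is presumably backed by persistent storage.
_seen_urls = set()

def url_already_crawled(url: str) -> bool:
    """Return True if this detail-page URL was already scraped by an earlier run."""
    return url in _seen_urls

def mark_crawled(url: str) -> None:
    _seen_urls.add(url)

def handle_link(url: str):
    """Mirror the log output above: skip known links, otherwise record and return the URL."""
    if url_already_crawled(url):
        logging.info("这个链接已经爬过了-----:%s", url)
        return None
    mark_crawled(url)
    return url  # in the spider this would become a scrapy.Request(url, callback=...)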
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: d2f89b4457c344ba
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6027
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/fagaiwei_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}
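The "Overridden settings" dump above shows per-spider configuration (its own LOG_FILE, a 1-second DOWNLOAD_DELAY, an extended RETRY_HTTP_CODES list). In Scrapy such values are typically attached to a spider through a custom_settings dict or passed to the crawler at start-up; the sketch below shows the custom_settings form under that assumption. The spider name here is illustrative, and the real values presumably come from demo1/custom_settings_conf.py, which this commit also touches. Note that RETRY_TIMES is documented as an integer retry count, so the True shown in the log would effectively behave as 1.

import scrapy

class NoticeSpider(scrapy.Spider):
    """Illustrative only: shows how the settings dumped in the log above can be
    attached to a single spider via custom_settings. Names are hypothetical;
    the project appears to keep the real values in demo1/custom_settings_conf.py."""
    name = "notice_example"
    # start_urls omitted; each real spider defines its own listing page.

    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "LOG_FILE": "logs/fagaiwei_2020_9.log",
        "RETRY_HTTP_CODES": [500, 502, 503, 504, 400, 403, 404, 408, 302],
        # RETRY_TIMES is an integer count of retries; a boolean True is coerced to 1.
        "RETRY_TIMES": 2,
    }

    def parse(self, response):
        # Placeholder callback; the real spiders extract notice links and detail pages here.
        yield {"lianjie": response.url}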

321
demo1/logs/kexujishubu_2020_9.log

@@ -0,0 +1,321 @@
2020-09-15 11:18:28 [scrapy.extensions.telnet] INFO: Telnet Password: 99625f5fb67f01df
2020-09-15 11:18:28 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:18:29 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:18:29 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:18:29 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.kexujishubuPipeline']
2020-09-15 11:18:29 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:18:29 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:18:29 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:18:29 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.most.gov.cn/tztg/index.htm> (referer: None)
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202009/t20200902_158634.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200828_158545.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200827_158488.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200826_158478.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200824_158442.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200820_158368.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200820_158367.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200820_158366.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200818_158360.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200812_158357.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202008/t20200806_158250.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202007/t20200731_158051.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202007/t20200729_158039.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202007/t20200728_158029.htm
2020-09-15 11:18:29 [root] INFO: 这个链接已经爬过了-----:http://www.most.gov.cn/tztg/202007/t20200728_158024.htm
2020-09-15 11:18:30 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.most.gov.cn/tztg/202009/t20200914_158714.htm> (referer: http://www.most.gov.cn/tztg/index.htm)
2020-09-15 11:18:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.most.gov.cn/tztg/202009/t20200914_158714.htm>
{'biaoti': '中国国际人才交流基金会2019年度公开招聘工作人员拟聘人员公示(第二批)',
'laiyuan': '科技部',
'lianjie': 'http://www.most.gov.cn/tztg/202009/t20200914_158714.htm',
'shijian': '2020-09-14',
'xiangqing': '<div class="Custom_UnionStyle">\n'
'<p> '
'根据事业单位公开招聘工作有关规定,现将我单位2019年公开招聘编制内工作人员拟聘人员(第二批)予以公示。公示期间,如有问题,请向我单位反映,或直接通过中央和国家机关所属事业单位公开招聘服务平台反映。 '
'</p>\n'
'<div align="center">\n'
'<table style="BORDER-COLLAPSE: collapse" bordercolor="#333333" '
'cellspacing="0" cellpadding="0" border="1">\n'
'<tbody>\n'
'<tr>\n'
'<td width="166">\n'
'<p align="center"><span><strong>岗位</strong></span></p></td>\n'
'<td width="68">\n'
'<p align="center"><span><strong>姓名</strong></span></p></td>\n'
'<td width="155">\n'
'<p '
'align="center"><span><strong>学历学位及专业</strong></span></p></td>\n'
'<td width="186">\n'
'<p '
'align="center"><span><strong>原工作单位</strong></span></p></td></tr>\n'
'<tr>\n'
'<td width="166">\n'
'<p align="center"><span>培训与评价处</span><span>副处长</span></p></td>\n'
'<td width="68">\n'
'<p align="center"><span>梁才</span></p></td>\n'
'<td width="155">\n'
'<p '
'align="center"><span>博士研究生<br></span><span>电力系统及其自动化</span></p></td>\n'
'<td width="186">\n'
'<p '
'align="center"><span>国网能源研究院有限公司</span></p></td></tr></tbody></table></div>\n'
'<p> 公示时间:2020年9月11日-9月21日 (七个工作日) <br> 受理电话:010-58882735 '
'<br> 来信地址及邮政编码:北京市海淀区北蜂窝中路3号中国国际人才交流基金会综合处 100038</p>\n'
'<p> </p>\n'
'<p align="center"> 中国国际人才交流基金会<br> '
'2020年9月11日</p></div>'}
2020-09-15 11:18:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.most.gov.cn/tztg/202009/t20200911_158708.htm> (referer: http://www.most.gov.cn/tztg/index.htm)
2020-09-15 11:18:32 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.most.gov.cn/tztg/202009/t20200911_158708.htm>
{'biaoti': '2020年科技部直属事业单位公开招聘应届高校毕业生第二批拟聘用人员公示',
'laiyuan': '科技部',
'lianjie': 'http://www.most.gov.cn/tztg/202009/t20200911_158708.htm',
'shijian': '2020-09-11',
'wenjian': [{'file_name': '2020年科技部直属事业单位公开招聘应届高校毕业生第二批拟聘用人员名单',
'file_url': 'http://www.most.gov.cn/tztg/202009/W020200911442476560658.xls',
'new_file': '/2020/09/kS6NPncp_W020200911442476560658.xls'}],
'xiangqing': '<p> '
'根据《事业单位人事管理条例》和《事业单位公开招聘人员暂行规定》,按照公开、平等、竞争、择优的原则,科技部组织开展了2020年直属事业单位公开招聘应届高校毕业生工作。按照规定程序,科学技术部机关服务中心、中国科学技术发展战略研究院、中国科学技术交流中心、中国农村技术开发中心、科学技术部火炬高技术产业开发中心、中国生物技术发展中心、科学技术部科技经费监管服务中心、中国国际核聚变能源计划执行中心、国家科技风险开发事业中心、科学技术部科技人才交流开发服务中心、中国国际人才交流基金会确定了拟聘用人员名单,现对拟聘用人员进行公示(详见附件)。<br> '
'公示时间为2020年9月11日—21日。对公示信息如有异议,可以口头或书面形式向相关用人单位反映问题,反映者须署真实姓名,并提供必要的调查线索。</p>\n'
'<p> 附件:<a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/kS6NPncp_W020200911442476560658.xls" '
'target="_blank" oldsrc="W020200911442476560658.xls" '
'_fcksavedurl="/webpic/W0202009/W020200911/W020200911442476560658.xls">2020年科技部直属事业单位公开招聘应届高校毕业生第二批拟聘用人员名单</a></p>\n'
'<p> </p>\n'
'<p align="center"> 科技部人事司<br> '
'2020年9月11日</p>'}
2020-09-15 11:18:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.most.gov.cn/tztg/202009/t20200909_158689.htm> (referer: http://www.most.gov.cn/tztg/index.htm)
2020-09-15 11:18:33 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.most.gov.cn/tztg/202009/t20200909_158689.htm>
{'biaoti': '中国科学技术发展战略研究院2020年面向社会公开招聘研究人员的公告',
'laiyuan': '科技部',
'lianjie': 'http://www.most.gov.cn/tztg/202009/t20200909_158689.htm',
'shijian': '2020-09-09',
'wenjian': [{'file_name': '中国科学技术发展战略研究院应聘报名登记表',
'file_url': 'http://www.most.gov.cn/tztg/202009/W020200909632058434997.doc',
'new_file': '/2020/09/oIhTDQlA_W020200909632058434997.doc'}],
'xiangqing': '中国科学技术发展战略研究院是科技部直属的综合性软科学研究机构,主要从事国家科学技术发展战略、政策、体制、管理、预测、评价以及科技促进经济社会发展等方面的研究,为国家科技、经济、社会发展的宏观决策提供咨询和建议。<br> '
'根据工作需要,现面向社会公开招聘2名事业编制研究人员。有关事宜通知如下。<br> 一、招聘基本条件<br> '
'(一)具有中华人民共和国国籍,遵守宪法和法律;<br> (二)政治思想素质好,遵纪守法,品行端正,无不良记录;<br> '
'(三)具有博士研究生学历和学位,具备应聘岗位所要求的工作能力及岗位所需的其它条件;<br> '
'(四)良好的沟通能力和团队合作精神;<br> (五)身体健康,能正常开展工作;<br> (六)具有北京市户口;<br> '
'(七)年龄40周岁以下(1980年7月1日以后出生)。<br> 二、岗位、人数及要求 \n'
'<table style="BORDER-COLLAPSE: collapse" bordercolor="#333333" '
'cellspacing="0" cellpadding="0" align="center" border="1">\n'
'<tbody>\n'
'<tr>\n'
'<td width="71">\n'
'<p align="center"><span><strong>编号</strong></span></p></td>\n'
'<td width="124">\n'
'<p align="center"><span><strong>岗位名称</strong></span></p></td>\n'
'<td width="54">\n'
'<p '
'align="center"><span><strong>招聘<br>人数</strong></span></p></td>\n'
'<td width="126">\n'
'<p align="center"><span><strong>专业</strong></span></p></td>\n'
'<td width="213">\n'
'<p '
'align="center"><span><strong>其他条件</strong></span></p></td></tr>\n'
'<tr>\n'
'<td valign="top" width="71">\n'
'<p align="center"><span>2020001</span></p></td>\n'
'<td valign="top" width="124">\n'
'<p align="justify"><span>综合发展研究岗</span></p></td>\n'
'<td valign="top" width="54">\n'
'<p align="center"><span>1</span></p></td>\n'
'<td valign="top" width="126">\n'
'<p align="justify"><span>经济学、金融学、国际贸易等相关专业</span></p></td>\n'
'<td valign="top" width="213">\n'
'<p '
'align="justify"><span>具有良好的团队合作精神,理论基础和文字功底扎实,有相关工作经验</span></p></td></tr>\n'
'<tr>\n'
'<td valign="top" width="71">\n'
'<p align="center"><span>2020002</span></p></td>\n'
'<td valign="top" width="124">\n'
'<p align="justify"><span>科技与社会发展研究岗</span></p></td>\n'
'<td valign="top" width="54">\n'
'<p align="center"><span>1</span></p></td>\n'
'<td valign="top" width="126">\n'
'<p><span>社会学、统计学、管理学等相关专业</span></p></td>\n'
'<td valign="top" width="213">\n'
'<p '
'align="justify"><span>具有良好的团队合作精神,理论基础和文字功底扎实,有相关工作经验</span></p></td></tr></tbody></table>\n'
'<p> 三、招聘程序<br> (一)报名<br> '
'应聘人员须填写《中国科学技术发展战略研究院应聘报名登记表》(详见附件),在2020年9月14日前将《应聘报名登记表》及相关证明材料的电子版(扫描件或照片)发至指定邮箱zhb@casted.org.cn。<br> '
'(二)资格审查<br> '
'根据招聘岗位条件对应聘人员资格审查,对于符合条件者,以短信或电子邮件的方式通知应聘者参加考试,同时在中国科学技术发展战略研究院网站(http://www.casted.org.cn)公布考试人员名单。请应聘者在笔试时携带身份证、毕业证、学位证,以及相关证明材料原件,以备核查。<br> '
'(三)考试<br> '
'采取笔试、面试相结合的方式进行。笔试内容为专业能力测试,面试内容为综合性考察。根据笔试成绩排序,成绩不低于60分方有资格进入面试;按1:5的比例参加面试,不足1:5的,按实际参加人数确定。考试其他事项另行通知。<br> '
'(四)考察、体检<br> '
'根据考试、面试综合成绩(笔试成绩占50%,面试成绩占50%)按1:1比例确定考察、体检人选,体检参照《公务员录用体检通用标准》等规定执行。如放弃资格或考察、体检不合格,按综合成绩顺次递补。<br> '
'(五)公示<br> '
'根据考试、体检和考察结果,确定拟聘人员,在中央和国家机关所属事业单位公开招聘服务平台、科学技术部、中国科学技术发展战略研究院网站公示。公示期为7个工作日。<br> '
'(六)聘用<br> '
'经公示无异议后,办理聘用手续,签定聘用合同。被聘用人员实行试用期(试用期2个月,试用期包括在聘用合同期限内),试用期满考核合格的,予以正式聘用;不合格的,解除聘用关系。聘用人员享受国家规定的工资福利待遇。<br> '
'特此公告。<br> 联系电话:58884679 <br> 电子邮箱:zhb@casted.org.cn。<br> '
'传真电话:010-58884678<br> 附件:<a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/oIhTDQlA_W020200909632058434997.doc" '
'target="_blank" oldsrc="W020200909632058434997.doc" '
'_fcksavedurl="/webpic/W0202009/W020200909/W020200909632058434997.doc">中国科学技术发展战略研究院应聘报名登记表</a></p>\n'
'<p align="center"> '
'中国科学技术发展战略研究院<br> 2020年9月9日</p>\n'
'<p> </p>\n'
'<div> </div>'}
2020-09-15 11:18:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.most.gov.cn/tztg/202009/t20200904_158652.htm> (referer: http://www.most.gov.cn/tztg/index.htm)
2020-09-15 11:18:35 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.most.gov.cn/tztg/202009/t20200904_158652.htm>
{'biaoti': '关于对“科技冬奥”重点专项2021年度项目申报指南征求意见的通知',
'laiyuan': '科技部',
'lianjie': 'http://www.most.gov.cn/tztg/202009/t20200904_158652.htm',
'shijian': '2020-09-04',
'wenjian': [{'file_name': '“科技冬奥”重点专项2021年度项目申报指南(征求意见稿)',
'file_url': 'http://www.most.gov.cn/tztg/202009/W020200904602892347374.doc',
'new_file': '/2020/09/C6fFHgSq_W020200904602892347374.doc'}],
'xiangqing': '<p> '
'根据《国务院关于改进加强中央财政科研项目和资金管理的若干意见》(国发〔2014〕11号)、《国务院关于深化中央财政科技计划(专项、基金等)管理改革方案的通知》(国发〔2014〕64号)、《科技部 '
'财政部关于印发&lt;国家重点研发计划管理暂行办法>的通知》(国科发资〔2017〕152号)等文件要求,现将“科技冬奥”重点专项2021年度项目申报指南(见附件)向社会征求意见和建议。征求意见时间为2020年9月4日至2020年9月18日,修改意见请于9月18日24点之前发至电子邮箱。<br> '
'国家重点研发计划相关重点专项的凝练布局和任务部署已经战略咨询与综合评审特邀委员会咨询评议,国家科技计划管理部际联席会议研究审议,并报国务院批准实施。本次征求意见重点针对各专项指南方向提出的目标指标和相关内容的合理性、科学性、先进性等方面听取各方意见和建议。科技部将会同有关部门、专业机构和专家,认真研究收到的意见和建议,修改完善相关重点专项的项目申报指南。征集到的意见和建议,将不再反馈和回复。<br> '
'联系方式:sfs_zhc@most.cn</p>\n'
'<p> 附件:<a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/C6fFHgSq_W020200904602892347374.doc" '
'target="_blank" oldsrc="W020200904602892347374.doc" '
'_fcksavedurl="/webpic/W0202009/W020200904/W020200904602892347374.doc">“科技冬奥”重点专项2021年度项目申报指南(征求意见稿)</a></p>\n'
'<p align="center"><br> '
'科技部社会发展科技司<br> 2020年9月4日</p>'}
2020-09-15 11:18:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.most.gov.cn/tztg/202009/t20200903_158644.htm> (referer: http://www.most.gov.cn/tztg/index.htm)
2020-09-15 11:18:36 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.most.gov.cn/tztg/202009/t20200903_158644.htm>
{'biaoti': '国家科技基础条件平台中心面向社会公开招聘工作人员公告',
'laiyuan': '科技部',
'lianjie': 'http://www.most.gov.cn/tztg/202009/t20200903_158644.htm',
'shijian': '2020-09-03',
'wenjian': [{'file_name': '国家科技基础条件平台中心公开招聘人员报名表',
'file_url': 'http://www.most.gov.cn/tztg/202009/W020200903611739213756.doc',
'new_file': '/2020/09/fTeew6hw_W020200903611739213756.doc'}],
'xiangqing': '<p> '
'国家科技基础条件平台中心(简称平台中心)是科学技术部直属事业单位,致力于推动科技资源优化配置,实现开放共享。根据《事业单位公开招聘人员暂行规定》和《科技部事业单位人事管理办法》等有关规定,按照“公开、公平、公正、竞争、择优”的原则,面向社会公开招聘一名工作人员。现将有关事宜通知如下: '
'</p>\n'
'<p> <strong>一、招聘岗位</strong> </p>\n'
'<p> 综合与监督处财务会计岗1名。 </p>\n'
'<p> <strong>二、基本条件及岗位要求</strong> </p>\n'
'<p> (一)基本条件 </p>\n'
'<p> 1. 具有中华人民共和国国籍,遵守宪法和法律,拥护中国共产党领导,品行端正,有强烈的事业心和责任心; </p>\n'
'<p> 2. 中共党员; </p>\n'
'<p> 3. 会计类专业,大学本科及以上学历和学位。 </p>\n'
'<p> 4. 具有较强的协作配合及团队精神,具有较强的综合协调、组织管理和文字表达能力; </p>\n'
'<p> 5. 北京市户口,身体健康,年龄一般在35周岁以下(1985年1月1日以后出生); </p>\n'
'<p> 6. 3年以上工作经验,有科研项目财务管理经验者优先。 </p>\n'
'<p> (二)岗位职责 </p>\n'
'<p> '
'承担中心各项会计核算工作;承担中心预决算编制工作,审核各项经费预算,监督各项预算的执行情况。承担编制报送会计报表,管理往来帐目,保管会计档案工作等。 '
'</p>\n'
'<p> <strong>三、招聘程序</strong> </p>\n'
'<p> '
'1.自愿报名。应聘人员须填写《国家科技基础条件平台中心公开招聘人员报名表》(见附件),同时须提交个人简历、学历学位证书、身份证、职称及相关资格证书和获奖证书等扫描件电子版,于2020年9月14日17:00前,发电子邮件至平台中心。 '
'</p>\n'
'<p> 2.资格审查。对应聘人员的资格条件进行审查,确定参加笔试的人员名单。 </p>\n'
'<p> '
'3.考试。考试分笔试与面试两部分。笔试主要测试应聘者综合分析能力、业务水平、文字表达能力,面试主要测试应聘者的沟通协调和岗位适应能力。对于笔试成绩合格者(60分以上),按1:5的比例组织面试。不足1:5的,按实际进入面试人数组织。笔试、面试时间地点及人员名单将在科技部网站及平台中心网站上公布。 '
'</p>\n'
'<p> '
'4.考察。根据综合成绩(笔试成绩占40%,面试成绩占60%)按照1:1比例确定考察人选,平台中心将对其政治表现、道德品质、业务能力、拟任岗位资格等进行调研和考察。 '
'</p>\n'
'<p> '
'5.体检。通知被考察人员到指定医疗机构进行身体检查。体检标准参照《公务员录用体检通用标准(试行)》(2010年修订)执行。 '
'</p>\n'
'<p> '
'6.公示。根据考试成绩、考察情况、体检结果,确定拟聘用人员名单,并对拟聘用人员在中央和国家机关公开招聘服务平台、科技部网站和平台中心网站上进行公示,公示期为7个工作日。 '
'</p>\n'
'<p> 7.签订聘用合同,办理聘用手续等。本次招聘人员试用期为6个月,适用期满考核合格的,予以正式聘用;不合格的,取消聘用。 '
'</p>\n'
'<p> <strong>四、注意事项</strong> </p>\n'
'<p> 1.请应聘人员按时限要求填报《招聘人员报名表》,过期不予受理,恕不接待来访。 </p>\n'
'<p> 2.请应聘人员关注并及时查阅科技部和平台中心网站相关信息,对各环节中未通过的应聘人员将不再单独通知。 </p>\n'
'<p> 3.应聘人员在应聘工作过程中所发生的一切费用自理。 </p>\n'
'<p> <strong>五、联系方式</strong> </p>\n'
'<p> 联系电话:(010)58881106,58881469 </p>\n'
'<p> 电子邮件:pingtai@most.cn </p>\n'
'<p> </p>\n'
'<p> 附件:<a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/fTeew6hw_W020200903611739213756.doc" '
'target="_blank" _fcksavedurl="C:\\fakepath\\平台中心招聘人员报名表.doc" '
'oldsrc="W020200903611739213756.doc">国家科技基础条件平台中心公开招聘人员报名表</a> '
'</p>\n'
'<p> </p>\n'
'<p align="center"> 国家科技基础条件平台中心 <br> '
'2020年9月3日</p>'}
2020-09-15 11:18:36 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:18:36 [root] INFO: 爬虫运行完毕了
2020-09-15 11:18:36 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1728,
'downloader/request_count': 6,
'downloader/request_method_count/GET': 6,
'downloader/response_bytes': 54032,
'downloader/response_count': 6,
'downloader/response_status_count/200': 6,
'elapsed_time_seconds': 7.019067,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 18, 36, 328563),
'item_scraped_count': 5,
'log_count/DEBUG': 11,
'log_count/INFO': 26,
'request_depth_max': 1,
'response_received_count': 6,
'scheduler/dequeued': 6,
'scheduler/dequeued/memory': 6,
'scheduler/enqueued': 6,
'scheduler/enqueued/memory': 6,
'start_time': datetime.datetime(2020, 9, 15, 3, 18, 29, 309496)}
2020-09-15 11:18:36 [scrapy.core.engine] INFO: Spider closed (finished)
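Every item scraped in the run above is dumped with the same field set: biaoti (title), laiyuan (source, here 科技部), lianjie (link), shijian (publication date), an optional wenjian list of attachments with file_name/file_url/new_file entries, and xiangqing (the detail body kept as HTML). A matching Item definition would look like the sketch below; this is an inference from the dumps, not necessarily the exact contents of demo1/items.py.

import scrapy

class PolicyNoticeItem(scrapy.Item):
    """Field set inferred from the item dicts dumped in the log above."""
    biaoti = scrapy.Field()     # notice title
    laiyuan = scrapy.Field()    # publishing agency, e.g. 科技部
    lianjie = scrapy.Field()    # canonical URL of the notice
    shijian = scrapy.Field()    # publication date string, e.g. '2020-09-14'
    wenjian = scrapy.Field()    # optional list of attachments: {file_name, file_url, new_file}
    xiangqing = scrapy.Field()  # detail body kept as HTML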
2020-09-16 08:47:15 [scrapy.extensions.telnet] INFO: Telnet Password: 091df706957f783b
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.kexujishubuPipeline']
2020-09-16 08:47:15 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:15 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:15 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-16 08:47:15 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'CONCURRENT_REQUESTS': 3,
'DOWNLOAD_DELAY': 8,
'LOG_FILE': 'logs/chacewang_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

1612
demo1/logs/qicetong_2020_9.log

File diff suppressed because it is too large

104
demo1/logs/sxfagaiwei_2020_9.log

@@ -0,0 +1,104 @@
2020-09-15 11:25:51 [scrapy.extensions.telnet] INFO: Telnet Password: a2570cb03adf56d2
2020-09-15 11:25:51 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:25:51 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:25:51 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:25:52 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:25:52 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:25:52 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:25:52 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:25:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://fgw.shanxi.gov.cn/xmsb/> (referer: None)
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/wzzs/202006/t20200604_124281.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/jjmy/202003/t20200320_123242.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/jgtz/202003/t20200309_123080.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/gdzctz/202003/t20200305_123010.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/wzzs/202003/t20200302_122952.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/gdzctz/202002/t20200228_122934.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/kjws/202002/t20200221_122773.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/hjyzy/202002/t20200213_122703.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/kjws/202002/t20200212_122695.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/gdzctz/202002/t20200206_122635.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/kjws/202002/t20200205_122612.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/kjws/201912/t20191203_121756.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/kjws/201911/t20191126_121615.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/gdzctz/201911/t20191126_121614.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/kjws/201911/t20191118_121510.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/ncjj/201911/t20191112_121439.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/gdzctz/201911/t20191105_121319.shtml
2020-09-15 11:25:53 [root] INFO: 这个链接已经爬过了-----:http://fgw.shanxi.gov.cn/fggz/wngz/gdzctz/201911/t20191105_121301.shtml
2020-09-15 11:25:53 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:25:53 [root] INFO: 爬虫运行完毕了
2020-09-15 11:25:53 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 226,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 51247,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 1.543148,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 25, 53, 570449),
'log_count/DEBUG': 1,
'log_count/INFO': 29,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 25, 52, 27301)}
2020-09-15 11:25:53 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 67ddf8a79c0e9956
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:17 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:17 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:17 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6036
2020-09-16 08:47:17 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/taiyuankjj_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

101
demo1/logs/sxgongxinting_2020_9.log

@@ -0,0 +1,101 @@
2020-09-15 11:24:04 [scrapy.extensions.telnet] INFO: Telnet Password: 9f00917b7af7e812
2020-09-15 11:24:04 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:24:04 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:24:04 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:24:04 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:24:04 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:24:04 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:24:04 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:24:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://gxt.shanxi.gov.cn/web/cateList.html?id=26&word=&pageIndex=1> (referer: None)
2020-09-15 11:24:04 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1511
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1500
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1479
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1474
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1473
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1462
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1388
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1310
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1243
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1182
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1195
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1197
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1219
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1225
2020-09-15 11:24:05 [root] INFO: 这个链接已经爬过了-----:http://gxt.shanxi.gov.cn/web/content.html?id=1063
2020-09-15 11:24:05 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:24:05 [root] INFO: 爬虫运行完毕了
2020-09-15 11:24:05 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 262,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 8294,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.451931,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 24, 5, 224221),
'log_count/DEBUG': 1,
'log_count/INFO': 26,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 24, 4, 772290)}
2020-09-15 11:24:05 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 60d158322cf5c039
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6034
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/sxzonggaishifanqu_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

106
demo1/logs/sxkejiting_2020_9.log

@@ -0,0 +1,106 @@
2020-09-15 11:23:21 [scrapy.extensions.telnet] INFO: Telnet Password: c1d9248c34ad74eb
2020-09-15 11:23:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:23:21 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:23:21 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:23:21 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:23:21 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:23:21 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:23:21 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:23:21 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://kjt.shanxi.gov.cn/tzgg/index.jhtml> (referer: None)
2020-09-15 11:23:21 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/tcc/49699.jhtml
2020-09-15 11:23:21 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/sfc/49633.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/sfc/49632.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/gxc/50209.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/gxc/50208.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/jcyjc/50207.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/nckjc/50201.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/gxc/50116.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/wzj/50101.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/jcyjc/50100.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/sfc/50090.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/zlghc/50085.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/fzjhc/50080.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/jcyjc/50077.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/wzj/50076.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/gjhzc/50069.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/gjhzc/50068.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/nckjc/50067.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/nckjc/50066.jhtml
2020-09-15 11:23:22 [root] INFO: 这个链接已经爬过了-----:http://kjt.shanxi.gov.cn:80/wzj/50063.jhtml
2020-09-15 11:23:22 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:23:22 [root] INFO: 爬虫运行完毕了
2020-09-15 11:23:22 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 237,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 39035,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.740745,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 23, 22, 360149),
'log_count/DEBUG': 1,
'log_count/INFO': 31,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 23, 21, 619404)}
2020-09-15 11:23:22 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 39a7d349a785725b
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6032
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/sxshangwuting_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

110
demo1/logs/sxshangwuting_2020_9.log

@@ -0,0 +1,110 @@
2020-09-15 11:23:34 [scrapy.extensions.telnet] INFO: Telnet Password: ee22c12439cb5178
2020-09-15 11:23:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:23:34 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:23:34 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:23:34 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:23:34 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:23:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:23:34 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:23:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://swt.shanxi.gov.cn/Main/list.action?channelId=27> (referer: None)
2020-09-15 11:23:34 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=4cb2c090-e719-41d0-ac0f-1abe541f183e
2020-09-15 11:23:34 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=43efe7bb-0a96-4484-b9f4-9184f35b94e8
2020-09-15 11:23:34 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=20355e00-5299-4693-b784-3ea132f68e12
2020-09-15 11:23:34 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=9daf0688-0f5d-467c-8531-ba1cefc92770
2020-09-15 11:23:34 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=0238484c-8496-4066-8996-3de03378979c
2020-09-15 11:23:34 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'fgw.shanxi.gov.cn': <GET http://fgw.shanxi.gov.cn/fggz/wngz/kjws/201911/t20191129_121660.shtml>
2020-09-15 11:23:34 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=e8944693-fe8b-4385-be73-4aa7715056f1
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=b913adc3-775d-4c3c-9ef0-ccb66eb6987f
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=318e14b2-ca25-4e91-b6b0-2b54a1f88348
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=923c9f58-34a3-4518-853c-b86f33787ebc
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=dff5d8f1-a830-44f2-ba68-3e2af3c52638
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=d0a6ba2d-952b-4d93-8663-ae9a4008ae0a
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=8be236d6-5365-44ef-990f-a6848a860346
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=f9a6ad01-6902-495a-84e4-6500c5e8f3cc
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=9fbb7bad-1119-4be7-b6df-9ecf2feb34f3
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=50e277e2-9d8f-499e-816f-aea870f89c89
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=4ee60e63-acca-4c86-8d9c-099f7bd3aa4f
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=c40c816b-a596-4f9f-94ac-1fe6154a7cf3
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=efa92a7b-16d3-496c-b07f-5a63525bafe1
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=31eb36b4-f197-4c3b-9162-2f332b050ced
2020-09-15 11:23:35 [root] INFO: 这个链接已经爬过了-----:http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=33bb2acd-de5d-442a-859f-2e9d95f73504
2020-09-15 11:23:35 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:23:35 [root] INFO: 爬虫运行完毕了
2020-09-15 11:23:35 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 250,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 8192,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.765148,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 23, 35, 245648),
'log_count/DEBUG': 2,
'log_count/INFO': 31,
'offsite/domains': 1,
'offsite/filtered': 1,
'request_depth_max': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 23, 34, 480500)}
2020-09-15 11:23:35 [scrapy.core.engine] INFO: Spider closed (finished)
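The DEBUG line "Filtered offsite request to 'fgw.shanxi.gov.cn'" in the run above comes from scrapy.spidermiddlewares.offsite.OffsiteMiddleware, which is enabled in every run here: a followed link whose host is not covered by the spider's allowed_domains is dropped instead of fetched, and the drop shows up in the offsite/domains and offsite/filtered stats. Below is a minimal sketch under the assumption that the commerce-department spider limits itself to swt.shanxi.gov.cn; the spider name and callbacks are illustrative.

import scrapy

class ShangwutingExampleSpider(scrapy.Spider):
    """Illustrative: with allowed_domains set like this, OffsiteMiddleware drops
    links pointing at other hosts (such as fgw.shanxi.gov.cn), which is exactly
    the 'Filtered offsite request' DEBUG line and offsite/* stats seen above."""
    name = "shangwuting_example"
    allowed_domains = ["swt.shanxi.gov.cn"]
    start_urls = ["http://swt.shanxi.gov.cn/Main/list.action?channelId=27"]

    def parse(self, response):
        for href in response.css("a::attr(href)").getall():
            # Requests for offsite hrefs yielded here are filtered by the middleware, not fetched.
            yield response.follow(href, callback=self.parse_detail)

    def parse_detail(self, response):
        yield {"lianjie": response.url}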
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 1a617e64c04cecf7
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6033
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/sxgongxinting_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

1305
demo1/logs/sxzonggaishifanqu_2020_9.log

File diff suppressed because it is too large

102
demo1/logs/taiyuangongyehexinxihuaju_2020_9.log

@@ -0,0 +1,102 @@
2020-09-15 11:26:36 [scrapy.extensions.telnet] INFO: Telnet Password: c6e85d4b0dcad085
2020-09-15 11:26:36 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:26:36 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:26:36 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:26:36 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:26:36 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:26:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:26:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:26:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://jxw.taiyuan.gov.cn/zfxxgk/gggs/index.shtml> (referer: None)
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/09/09/1009086.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/09/08/1009024.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/09/08/1009023.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/07/29/996943.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/07/14/992583.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/07/07/991266.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/05/21/979674.shtml
2020-09-15 11:26:37 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'mp.weixin.qq.com': <GET https://mp.weixin.qq.com/s/mcyJT6mCdES9_El1DeTfcg>
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/05/20/979447.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/05/19/979251.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/04/20/970538.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/03/27/965251.shtml
2020-09-15 11:26:37 [root] INFO: 这个链接已经爬过了-----:http://jxw.taiyuan.gov.cn/doc/2020/03/27/965256.shtml
2020-09-15 11:26:37 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:26:37 [root] INFO: 爬虫运行完毕了
2020-09-15 11:26:37 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 245,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 17676,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.612552,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 26, 37, 303151),
'log_count/DEBUG': 2,
'log_count/INFO': 23,
'offsite/domains': 1,
'offsite/filtered': 3,
'request_depth_max': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 26, 36, 690599)}
2020-09-15 11:26:37 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:17 [scrapy.extensions.telnet] INFO: Telnet Password: ff33c7fb5df2dc19
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:17 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:17 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:17 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6038
2020-09-16 08:47:17 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/taiyuanshangwuju_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

203
demo1/logs/taiyuankjj_2020_9.log

@ -0,0 +1,203 @@
2020-09-15 11:26:10 [scrapy.extensions.telnet] INFO: Telnet Password: 423034b8342a486e
2020-09-15 11:26:10 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:26:11 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:26:11 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:26:11 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:26:11 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:26:11 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:26:11 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:26:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://kjj.taiyuan.gov.cn/zfxxgk/gggs/index.shtml> (referer: None)
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/09/07/1008391.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/09/04/1008199.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/08/21/1004590.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/08/13/1001630.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/08/08/999926.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/07/31/997727.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/07/17/993580.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/06/23/988275.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/06/22/988019.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/06/19/987592.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/06/15/986244.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/06/15/986238.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/06/15/986237.shtml
2020-09-15 11:26:11 [root] INFO: 这个链接已经爬过了-----:http://kjj.taiyuan.gov.cn/doc/2020/06/15/986236.shtml
2020-09-15 11:26:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://kjj.taiyuan.gov.cn/doc/2020/09/12/1010113.shtml> (referer: http://kjj.taiyuan.gov.cn/zfxxgk/gggs/index.shtml)
2020-09-15 11:26:12 [scrapy.core.scraper] DEBUG: Scraped from <200 http://kjj.taiyuan.gov.cn/doc/2020/09/12/1010113.shtml>
{'biaoti': '关于征求太原市地方标准《科技成果评价规范(征求意见稿)》意见的通知',
'laiyuan': '太原市科学技术局',
'lianjie': 'http://kjj.taiyuan.gov.cn/doc/2020/09/12/1010113.shtml',
'shijian': '2020-09-12',
'wenjian': [{'file_name': '1.科技成果评价规范(征求意见稿).doc',
'file_url': 'http://kjj.taiyuan.gov.cn/uploadfiles/202009/12/2020091222053429459132.doc',
'new_file': '/2020/09/Yys4ES6z_2020091222053429459132.doc'},
{'file_name': '2.地方标准征求意见反馈表.doc',
'file_url': 'http://kjj.taiyuan.gov.cn/uploadfiles/202009/12/2020091221401014098186.doc',
'new_file': '/2020/09/ucvansUw_2020091221401014098186.doc'}],
'xiangqing': '<div id="Zoom"> \n'
' <!--<$[CONTENT]>start-->\n'
' <!--<p style="text-align:center;"><img src="" '
'/></p>-->\n'
' <p></p><p align="justify" style="text-align: justify; '
'line-height: 200%; text-indent: 0pt; -ms-text-autospace: '
'ideograph-numeric; -ms-text-justify: inter-ideograph;"><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">各相关单位</span></span><span style="font-size: 11pt;"><span '
'style="font-family: 宋体;">和个人</span></span><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">:</span></span></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;"><span style="font-size: '
'11pt;"><span style="font-family: '
'宋体;">根据国家《地方标准管理办法》要求,现就太原市科学技术局提出,太原技术转移促进中心、山西产业互联网研究院、山西省大众科技评估中心起草的地方标准《科技成果评价规范(征求意见稿)》,向社会公开征求意见,请各有关单位及个人提出意见,并填写《征求意见反馈表》,于2020年10月11日前反馈至市科技局计划处。</span></span></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;"><span style="font-size: '
'11pt;"><span style="font-family: 宋体;">联 系 人:</span></span><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">张晓军</span></span></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;"><span style="font-size: '
'11pt;"><span style="font-family: 宋体;">联系电话:</span></span><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">4223750</span></span></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;"><span style="font-size: '
'11pt;"><span style="font-family: 宋体;">电子邮箱:</span></span><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">cxfz701</span></span><span style="font-size: 11pt;"><span '
'style="font-family: 宋体;">@1</span></span><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">63</span></span><span style="font-size: 11pt;"><span '
'style="font-family: 宋体;">.com</span></span></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;">\xa0</p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;"><span style="font-size: '
'11pt;"><span style="font-family: '
'宋体;">附</span></span>\xa0\xa0\xa0\xa0<span style="font-size: '
'11pt;"><span style="font-family: 宋体;">件:</span></span></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;"><a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/Yys4ES6z_2020091222053429459132.doc" '
'target="_blank" '
'title="1.科技成果评价规范(征求意见稿).doc">1.科技成果评价规范(征求意见稿).doc</a></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 22pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;"><a '
'href="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy/2020/09/ucvansUw_2020091221401014098186.doc" '
'target="_blank" '
'title="2.地方标准征求意见反馈表.doc">2.地方标准征求意见反馈表.doc</a></p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 0pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;">\xa0</p>\n'
'\n'
'<p align="justify" style="text-align: justify; line-height: '
'200%; text-indent: 0pt; -ms-text-autospace: ideograph-numeric; '
'-ms-text-justify: inter-ideograph;">\xa0</p>\n'
'\n'
'<p align="right" style="text-align: right; line-height: 200%; '
'text-indent: 0pt; -ms-text-autospace: ideograph-numeric;"><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">太原市科学技术局</span></span></p>\n'
'\n'
'<p align="right" style="text-align: right; line-height: 200%; '
'text-indent: 0pt; -ms-text-autospace: ideograph-numeric;"><span '
'style="font-size: 11pt;"><span style="font-family: '
'宋体;">2020年9月12日</span></span></p>\n'
'\n'
' <!--<$[CONTENT]>end--> \n'
' </div>'}
2020-09-15 11:26:12 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:26:12 [root] INFO: 爬虫运行完毕了
2020-09-15 11:26:12 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 555,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 33217,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 1.491522,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 26, 12, 594548),
'item_scraped_count': 1,
'log_count/DEBUG': 3,
'log_count/INFO': 25,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2020, 9, 15, 3, 26, 11, 103026)}
2020-09-15 11:26:12 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:17 [scrapy.extensions.telnet] INFO: Telnet Password: d2a8a3ac7c4697ab
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:17 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:17 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:17 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:17 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6037
2020-09-16 08:47:17 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/taiyuangongyehexinxihuaju_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

1985
demo1/logs/taiyuanshangwuju_2020_9.log

File diff suppressed because it is too large

106
demo1/logs/wenhuahelvyoubu_2020_9.log

@ -0,0 +1,106 @@
2020-09-15 11:22:21 [scrapy.extensions.telnet] INFO: Telnet Password: d58f88db3f55832a
2020-09-15 11:22:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:22:21 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:22:21 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:22:21 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:22:21 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:22:21 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:22:21 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:22:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.mct.gov.cn/whzx/ggtz/index.htm> (referer: None)
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202009/t20200908_874960.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202009/t20200907_874843.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202009/t20200907_874793.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202009/t20200901_874605.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200831_874550.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200831_874501.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200831_874504.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200824_874310.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/ceshi/gztz/202008/t20200821_874282.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200819_874226.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200819_874227.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200814_874141.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:https://www.mct.gov.cn/whzx/ggtz/202008/t20200813_874115.htm
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202008/t20200812_874083.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202007/t20200729_873774.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202007/t20200728_873742.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202007/t20200728_873743.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202007/t20200727_873716.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202007/t20200717_873583.html?keywords=
2020-09-15 11:22:22 [root] INFO: 这个链接已经爬过了-----:http://zwgk.mct.gov.cn/auto255/202007/t20200717_873581.html?keywords=
2020-09-15 11:22:22 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:22:22 [root] INFO: 爬虫运行完毕了
2020-09-15 11:22:22 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 237,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 17265,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.825351,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 22, 22, 746644),
'log_count/DEBUG': 1,
'log_count/INFO': 31,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 22, 21, 921293)}
2020-09-15 11:22:22 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 566787543480039e
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6029
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/zhongxiaoqiyezongju_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

110
demo1/logs/zhongxiaoqiyezongju_2020_9.log

@ -0,0 +1,110 @@
2020-09-15 11:22:50 [scrapy.extensions.telnet] INFO: Telnet Password: 2241718bb5310557
2020-09-15 11:22:50 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:22:51 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:22:51 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:22:51 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:22:51 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:22:51 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:22:51 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:22:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/index.html> (referer: None)
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c8069941/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c8041166/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c8025977/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c8025439/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7998949/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7998890/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7941210/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7941178/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7941146/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7917147/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7897089/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7869105/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7858580/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7838720/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7827070/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7826412/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7683073/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7669471/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7572659/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7557666/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7473474/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7452903/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c7452084/content.html
2020-09-15 11:22:51 [root] INFO: 这个链接已经爬过了-----:http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/c6999390/content.html
2020-09-15 11:22:51 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:22:51 [root] INFO: 爬虫运行完毕了
2020-09-15 11:22:51 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 274,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 28424,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.597737,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 22, 51, 669452),
'log_count/DEBUG': 1,
'log_count/INFO': 35,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 22, 51, 71715)}
2020-09-15 11:22:51 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet Password: 58d863091de49bb3
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6030
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/fazhancujinju_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

142
demo1/logs/ziranweiyuanhui_2020_9.log

@ -0,0 +1,142 @@
2020-09-15 11:19:58 [scrapy.extensions.telnet] INFO: Telnet Password: 0740a21bdbade713
2020-09-15 11:19:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:19:59 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:19:59 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:19:59 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:19:59 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:19:59 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:19:59 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:19:59 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/huojuzhongxin_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}
2020-09-15 11:21:14 [scrapy.extensions.telnet] INFO: Telnet Password: ec33d8ae3d9048d8
2020-09-15 11:21:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-15 11:21:14 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-15 11:21:14 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-15 11:21:14 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-15 11:21:14 [scrapy.core.engine] INFO: Spider opened
2020-09-15 11:21:14 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-15 11:21:14 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-15 11:21:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page1.htm> (referer: None)
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78356.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78152.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78573.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78571.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78569.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78567.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78537.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78522.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78519.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78513.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78512.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78478.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78477.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78463.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78460.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78459.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78457.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78444.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78437.htm
2020-09-15 11:21:15 [root] INFO: 这个链接已经爬过了-----:http://www.nsfc.gov.cn/publish/portal0/tab442/info78435.htm
2020-09-15 11:21:15 [scrapy.core.engine] INFO: Closing spider (finished)
2020-09-15 11:21:15 [root] INFO: 爬虫运行完毕了
2020-09-15 11:21:15 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 262,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 6981,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.49302,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 15, 3, 21, 15, 413794),
'log_count/DEBUG': 1,
'log_count/INFO': 31,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 9, 15, 3, 21, 14, 920774)}
2020-09-15 11:21:15 [scrapy.core.engine] INFO: Spider closed (finished)
2020-09-16 08:47:15 [scrapy.extensions.telnet] INFO: Telnet Password: 47efc608d9467042
2020-09-16 08:47:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-09-16 08:47:16 [scrapy.middleware] INFO: Enabled item pipelines:
['demo1.pipelines.ziranweiyuanhuiPipline']
2020-09-16 08:47:16 [scrapy.core.engine] INFO: Spider opened
2020-09-16 08:47:16 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-16 08:47:16 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6026
2020-09-16 08:47:16 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'demo1',
'DOWNLOAD_DELAY': 1,
'LOG_FILE': 'logs/huojuzhongxin_2020_9.log',
'NEWSPIDER_MODULE': 'demo1.spiders',
'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
'RETRY_TIMES': True,
'SPIDER_MODULES': ['demo1.spiders']}

19
demo1/main.py

@ -3,5 +3,22 @@ from scrapy.utils.project import get_project_settings
if __name__ == '__main__':
process = CrawlerProcess(get_project_settings())
process.crawl('chacewangSpider') # replace spider_name here with the name of your own spider
# process.crawl('kexujishubuSpider') # replace spider_name here with the name of your own spider
# process.crawl('chacewangSpider')
# process.crawl('gongyehexinxihuabuSpider')  # this one probably needs to carry a cookie; experiment with it some more
# process.crawl('ziranweiyuanhuiSpider')
# process.crawl('huojuzhognxinSpider')
# process.crawl('fagaiweiSpider')
# process.crawl('wenhuahelvyoubuSpider')
# process.crawl('zhongxiaoqiyejuSpider')
# process.crawl('cujinjuSpider')
# process.crawl('shanxishengkejitingSpider')
# process.crawl('sxsshangwutingSpider')
# process.crawl('sxgongxintingSpider')
# process.crawl('sxzonggaishifanquSpider')
# process.crawl('sxfagaiweiSpider')
# process.crawl('taiyuankexuejishujuSpider')
# process.crawl('taiyuangongyehexinxihuajuSpider')
# process.crawl('taiyuangongshangwujuSpider')
process.crawl('qicetongspider')
process.start()

207
demo1/pipelines.py

@ -15,8 +15,11 @@ from scrapy.utils.project import get_project_settings
import pymongo
from twisted.enterprise import adbapi
from DBUtils.PooledDB import PooledDB
from demo1.Util import Asyninser
from demo1.db_utils import MysqlUtil
from scrapy.utils.project import get_project_settings
import logging
class Demo1Pipeline:
def process_item(self, item, spider):
@ -159,8 +162,11 @@ class ProcessMysqlPipeline(object):
cursor.execute(insert_sql, (item['name'], item['base_url'], item['date'], item['coment'],))
# The classmethod runs first, so this executes much earlier than the statements in __init__.
# Throughout this pipeline the database connection is obtained asynchronously.
# For the Chacewang site
class MysqlYiBUPipeline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@ -202,7 +208,7 @@ class MysqlYiBUPipeline(object):
query = self.dbpool.runInteraction(self.do_insert, asynItem) # 指定操作方法和操作数据
# add error handling
query.addErrback(self.handle_error) # handle exceptions
return item
def do_insert(self, cursor, item):
# Perform the database insert; no explicit commit is needed, Twisted commits automatically.
# The block below is mock data
@ -218,7 +224,7 @@ class MysqlYiBUPipeline(object):
# 'name1','name2','name3','name1','name2','name3','name1','name2','name3'
# ]
# }
logging.info(item)
#logging.info(item)
if item['leixing']=='申报通知':
item['leixing']=str(1)
elif item['leixing']=='公示公告':
@ -274,6 +280,201 @@ class MysqlYiBUPipeline(object):
def handle_error(self, failure):
if failure:
# log the error details
logging.info('数据库插入异常信息--------:'+failure)
logging.info('----------数据库插入异常信息--------')
logging.info(failure)
logging.info('---------异常信息结束--------')
def close_spider(self, spider):
logging.info('爬虫运行完毕了')
# For the Ministry of Science and Technology
class kexujishubuPipeline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls,settings): # fixed method name, called by Scrapy; gives direct access to the settings values
"""
Build the database connection pool.
:param settings: project settings
:return: pipeline instance
"""
adbparams = dict(
host=settings['MYSQL_HOST'],
db=settings['MYSQL_DATABASE'],
user=settings['MYSQL_USER'],
password=settings['MYSQL_PASSWORD'],
cursorclass=pymysql.cursors.DictCursor # specify the cursor type
)
# Build the adbapi ConnectionPool, connecting through pymysql (or MySQLdb)
dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# return the instantiated pipeline
return cls(dbpool)
def process_item(self, item, spider):
"""
Use Twisted to make the MySQL insert asynchronous: the connection pool runs the actual SQL and returns a Deferred.
"""
asynItem = copy.deepcopy(item)
query = self.dbpool.runInteraction(self.do_insert, asynItem) # specify the insert callable and the data to operate on
# add error handling
query.addErrback(self.handle_error) # handle exceptions
return item
def do_insert(self, cursor, item):
# Perform the database insert; no explicit commit is needed, Twisted commits automatically.
# The block below is mock data
#logging.info(item)
sel_sql='''
select id from t_area where area_short is not null and area_short = '%s'
''' % item["laiyuan"]
cursor.execute(sel_sql)
result1=cursor.fetchall()
if len(result1)==0:
insert_sql = '''
insert into t_area(area_name,area_short,area_status,area_parent_id,area_type) values('%s','%s','%s','%s','%s')
''' % (str(item["laiyuan"]), str(item["laiyuan"]), str(1), str(1000000), str(1))
cursor.execute(insert_sql)
cursor.execute(sel_sql)
result1 = cursor.fetchall()
laiyuan_id=result1[0].get('id')
item["jianjie"] = '_'
insert_sql2='''
insert into t_policy(title,title_url,img_url,publish_depart_id,publish_time,content,intro)
values('%s','%s','%s','%s','%s','%s','%s')
''' % (str(item["biaoti"]),str(item['lianjie']),get_project_settings().get('TITLE_IMAGE')+str(random.randint(0,9))+'.png',str(laiyuan_id),item["shijian"],pymysql.escape_string(item["xiangqing"]),item["jianjie"])
sel_sql2='''
select id from t_policy where title_url='%s'
'''% (item["lianjie"])
cursor.execute(insert_sql2)
cursor.execute(sel_sql2)
result2 = cursor.fetchall()
xinwen_id=result2[-1].get('id')
item['biaoqian']=['_']
for dange_biaoqian in item['biaoqian']:
insert_sql3='''
insert into t_policy_label(policy_id,label_name) values('%s','%s')
'''% (str(xinwen_id),str(dange_biaoqian))
cursor.execute(insert_sql3)
if item.get('wenjian') is not None:
b = []
for a in item.get('wenjian'):
b.append(a['file_name'])
b.append(a['file_url'])
b.append(a['new_file'])
down_list_num = len(item.get('wenjian'))
insert_sql4 = 'insert into t_policy_file_crawl(policy_id,file_name,file_url,file_location) values' + \
str((('("' + str(xinwen_id) + '","{}","{}","{}"),') * down_list_num).rstrip(',')).format(*b)
cursor.execute(insert_sql4)
def handle_error(self, failure):
if failure:
# log the error details
logging.info('----------数据库插入异常信息--------')
logging.info(failure)
logging.info('---------异常信息结束--------')
def close_spider(self, spider):
logging.info('爬虫运行完毕了')
# Ministry of Industry and Information Technology (MIIT)
class gongyehexinxihuabuPipline(Asyninser):
def __init__(self,dbpool):
self.dbpool=dbpool
def do_insert(self, cursor, item):
sel_sql = '''
select id from t_area where area_short is not null and area_short = '%s'
''' % item["laiyuan"]
cursor.execute(sel_sql)
result1 = cursor.fetchall()
if len(result1)==0:
insert_sql = '''
insert into t_area(area_name,area_short,area_status,area_parent_id,area_type) values('%s','%s','%s','%s','%s')
''' % (str(item["laiyuan"]), str(item["laiyuan"]), str(1), str(1000000), str(1))
cursor.execute(insert_sql)
cursor.execute(sel_sql)
result1 = cursor.fetchall()
laiyuan_id = result1[0].get('id')
item["jianjie"] = '_'
insert_sql2 = '''
insert into t_policy(title,title_url,img_url,publish_depart_id,publish_time,content,intro)
values('%s','%s','%s','%s','%s','%s','%s')
''' % (str(item["biaoti"]), str(item['lianjie']),
get_project_settings().get('TITLE_IMAGE') + str(random.randint(0, 9)) + '.png',
str(laiyuan_id), item["shijian"], pymysql.escape_string(item["xiangqing"]),
item["jianjie"])
sel_sql2 = '''
select id from t_policy where title_url='%s'
''' % (item["lianjie"])
cursor.execute(insert_sql2)
cursor.execute(sel_sql2)
result2 = cursor.fetchall()
xinwen_id = result2[-1].get('id')
item['biaoqian'] = ['_']
for dange_biaoqian in item['biaoqian']:
insert_sql3 = '''
insert into t_policy_label(policy_id,label_name) values('%s','%s')
''' % (str(xinwen_id), str(dange_biaoqian))
cursor.execute(insert_sql3)
if item.get('wenjian') is not None:
b = []
for a in item.get('wenjian'):
b.append(a['file_name'])
b.append(a['file_url'])
b.append(a['new_file'])
down_list_num = len(item.get('wenjian'))
insert_sql4 = 'insert into t_policy_file_crawl(policy_id,file_name,file_url,file_location) values' + \
str((('("' + str(xinwen_id) + '","{}","{}","{}"),') * down_list_num).rstrip(',')).format(*b)
cursor.execute(insert_sql4)
# National Natural Science Foundation of China (NSFC) and the Torch Center
class ziranweiyuanhuiPipline(Asyninser):
def __init__(self,dbpool):
self.dbpool=dbpool
def do_insert(self, cursor, item):
sel_sql = '''
select id from t_area where (area_short is not null and area_short = '%s') or (area_name = '%s')
''' % (item["laiyuan"],item["laiyuan"])
cursor.execute(sel_sql)
result1 = cursor.fetchall()
if len(result1) == 0:
insert_sql = '''
insert into t_area(area_name,area_short,area_status,area_parent_id,area_type) values('%s','%s','%s','%s','%s')
''' % (str(item["laiyuan"]), str(item["laiyuan"]), str(1), str(1000000), str(1))
cursor.execute(insert_sql)
cursor.execute(sel_sql)
result1 = cursor.fetchall()
laiyuan_id = result1[0].get('id')
item["jianjie"] = '_'
insert_sql2 = '''
insert into t_policy(title,title_url,img_url,publish_depart_id,publish_time,content,intro)
values('%s','%s','%s','%s','%s','%s','%s')
''' % (str(item["biaoti"]), str(item['lianjie']),
get_project_settings().get('TITLE_IMAGE') + str(
random.randint(0, 9)) + '.png',
str(laiyuan_id), item["shijian"], pymysql.escape_string(item["xiangqing"]),
item["jianjie"])
sel_sql2 = '''
select id from t_policy where title_url='%s'
''' % (item["lianjie"])
cursor.execute(insert_sql2)
cursor.execute(sel_sql2)
result2 = cursor.fetchall()
xinwen_id = result2[-1].get('id')
item['biaoqian'] = ['_']
for dange_biaoqian in item['biaoqian']:
insert_sql3 = '''
insert into t_policy_label(policy_id,label_name) values('%s','%s')
''' % (str(xinwen_id), str(dange_biaoqian))
cursor.execute(insert_sql3)
if item.get('wenjian') is not None:
b = []
for a in item.get('wenjian'):
b.append(a['file_name'])
b.append(a['file_url'])
b.append(a['new_file'])
down_list_num = len(item.get('wenjian'))
insert_sql4 = 'insert into t_policy_file_crawl(policy_id,file_name,file_url,file_location) values' + \
str((('("' + str(xinwen_id) + '","{}","{}","{}"),') * down_list_num).rstrip(',')).format(*b)
cursor.execute(insert_sql4)
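The pipelines above that subclass Asyninser (gongyehexinxihuabuPipline and ziranweiyuanhuiPipline) only override do_insert; the shared asynchronous plumbing lives in demo1/Util.py, which this view does not expand. Below is a minimal sketch of what that base class presumably looks like, assuming it follows the same twisted adbapi pattern spelled out in kexujishubuPipeline above.
# Hypothetical sketch of demo1.Util.Asyninser (its source is collapsed in this diff); it assumes
# the same from_settings / runInteraction / addErrback flow used by kexujishubuPipeline above.
import copy
import logging

import pymysql
from twisted.enterprise import adbapi


class Asyninser(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # build an asynchronous connection pool from the project settings
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DATABASE'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor,
        )
        return cls(adbapi.ConnectionPool('pymysql', **adbparams))

    def process_item(self, item, spider):
        # snapshot the item so the deferred insert is not affected by later mutation
        asynItem = copy.deepcopy(item)
        query = self.dbpool.runInteraction(self.do_insert, asynItem)
        query.addErrback(self.handle_error)
        return item

    def do_insert(self, cursor, item):
        # subclasses supply the table-specific SQL here
        raise NotImplementedError

    def handle_error(self, failure):
        if failure:
            logging.info(failure)

    def close_spider(self, spider):
        logging.info('爬虫运行完毕了')  # matches the "爬虫运行完毕了" lines in the logs above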

30
demo1/settings.py

@ -9,7 +9,8 @@
import datetime
import random
# Set this to False to resume an earlier crawl of the site; set it to True to traverse the whole site again.
RGODIC=True
# RGODIC=True
ISQUANPA=True
#scrapyd-deploy -p chacewang -v 0.0.0 --build-egg=noall_demo1.egg
BOT_NAME = 'demo1'
# Location where downloaded files are stored; the only requirement is the trailing '/' at the end of the path, which the code already accounts for.
@ -107,8 +108,17 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'demo1.pipelines.MongoDBPipeline': None,
# Chacewang
'demo1.pipelines.MysqlYiBUPipeline': 678,
# Ministry of Science and Technology
'demo1.pipelines.kexujishubuPipeline':679,
# MIIT
'demo1.pipelines.gongyehexinxihuabuPipline':680,
# NSFC
'demo1.pipelines.ziranweiyuanhuiPipline':681,
# Torch Center - its pipeline is probably identical, so most likely no separate one is needed; reuse the NSFC pipeline
#'demo1.pipelines.huojuzhongxinPipline': 682
}
# Enable and configure the AutoThrottle extension (disabled by default)
@ -148,16 +158,22 @@ MYSQL_PASSWORD = "sdfe@#$QW"
MYSQL_CHARSET = "utf8"
# Logging
current_day = datetime.datetime.now()
LOG_ENABLED = True # enable logging
LOG_ENCODING = 'utf-8'
LOG_FILE = "logs/{}_{}_{}.log".format(current_day.year, current_day.month, current_day.day)
#LOG_FILE = "logs/{}_{}_{}.log".format(current_day.year, current_day.month, current_day.day)
LOG_LEVEL = "DEBUG"
LOG_STDOUT = False # when True, redirect stdout output (e.g. print) into the log file
# URL prefix for downloaded files
FILE_PATH="http://49.232.6.143/file/download/know?path=/home/enterprise/staticrec/policy"
FILE_PATH="https://www.sxwikionline.com/gateway/enterprise/file/download/know?path=/home/enterprise/staticrec/policy"
# URL prefix for images
MESSAGE="http://49.232.6.143/staticrec/policy"
TITLE_IMAGE="http://49.232.6.143/staticrec/policy/image/"
MESSAGE="https://www.sxwikionline.com/staticrec/policy"
# URL prefix for the title images (icons)
TITLE_IMAGE="https://www.sxwikionline.com/staticrec/policy/image/"
# This setting means: when one of these error codes is returned, the request is resent; codes not listed here are never retried, so list every case that should be retried.
# To ignore errors entirely and never retry, just set it to [].
# The framework's built-in retry mechanism; retrying has to be enabled first.
@ -165,4 +181,4 @@ RETRY_TIMES=True
# number of retries (note: Scrapy reads the retry count from RETRY_TIMES; RETRY_HTTP_CODECS is not a built-in setting)
RETRY_HTTP_CODECS=6
# which HTTP status codes should be retried
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 403, 404, 408]
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 403, 404, 408, 302]
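Each spider overrides these project-wide settings through a dict imported from demo1/custom_settings_conf.py and assigned to custom_settings (see the spiders below); that file is not expanded in this view. A hypothetical example of one such entry, reconstructed from the "Overridden settings" dumps in the logs above, so the concrete keys and values are assumptions:
# Hypothetical entry in demo1/custom_settings_conf.py (file not shown); the values mirror the
# "Overridden settings" blocks logged by scrapy.crawler above.
custom_settings_conf_fagaiwei = {
    'DOWNLOAD_DELAY': 1,
    'LOG_FILE': 'logs/fagaiwei_2020_9.log',
    'RETRY_TIMES': True,
    'RETRY_HTTP_CODES': [500, 502, 503, 504, 400, 403, 404, 408, 302],
}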

BIN
demo1/spiders/__pycache__/chacewangSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/fagaiweiSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/gongyehexinxihuabuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/huojuzhongxinSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/kexujishubuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/qicetongSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/shanxifagaiwei.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/shanxigongxintingSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/shanxishengkejitingSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/shanxishengshangwutingSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/shanxixiaoqiyecujinjuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/shanxizonggaiquSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/taiyuangongyehexinxihuajuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/taiyuanshangwujuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/taiyuanshikexujishujuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/wenhuahelvyoubuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/zhongxiaoqiyejuSpider.cpython-37.pyc

Binary file not shown.

BIN
demo1/spiders/__pycache__/ziranweiyuanhuiSpider.cpython-37.pyc

Binary file not shown.

16
demo1/spiders/chacewangSpider.py

@ -4,6 +4,7 @@ import re
from uuid import uuid4
import copy
import datetime
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
@ -19,7 +20,7 @@ class ChacewangSpider(scrapy.Spider):
忻州currentCity :18B4119A-8390-4233-BDC5-F01F66CF8804
'''
allowed_domains = ['chacewang.com']
custom_settings = custom_settings_conf_chacewang
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
@ -32,7 +33,6 @@ class ChacewangSpider(scrapy.Spider):
use_unicode=True
)
self.cursor = self.db.cursor()
self.ergodic=get_project_settings().get('RGODIC')
def start_requests(self):
cityAndCookie=currenCitys= self.settings.get('CURRENT_CITY')
@ -106,7 +106,7 @@ class ChacewangSpider(scrapy.Spider):
#Check whether the link we are about to insert already exists, which requires querying the database.
#Put another way: the asynchronous operation is written directly here.
if not self.ergodic:
if not self.settings.get('ISQUANPA'):
#resume crawl
self.cursor.execute('select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(url))
@ -114,7 +114,7 @@ class ChacewangSpider(scrapy.Spider):
#A counter should be added here to check the page number and whether this is the home page; the condition below covers the home page and page one. To check further pages, also add pageindex=2 in str(req_url).lower() to the front of the condition.
if res==0:
yield scrapy.FormRequest(url=item['lianjie'],
meta={'item': copy.deepcopy(item)},
meta={'item': item},
callback=self.parse_url,
method='GET'
)
@ -123,7 +123,7 @@ class ChacewangSpider(scrapy.Spider):
else:
#full crawl
yield scrapy.FormRequest(url=item['lianjie'],
meta={'item': copy.deepcopy(item)},
meta={'item': item},
callback=self.parse_url,
method='GET'
)
@ -139,9 +139,9 @@ class ChacewangSpider(scrapy.Spider):
next_url=next_urls[-2].xpath('.//a/@href').extract_first()
if 'javascript:void(0)' not in next_url:
urls=self.settings.get('WANGZHI')+next_url
if not self.ergodic:
if not self.settings.get('ISQUANPA'):
# This line decides whether to follow the next page, i.e. how many pages of data to crawl; remove this if statement to traverse the entire site.
if ('pageindex=1' in str(req_url).lower()) or ('pageindex' not in str(req_url)):
if (('pageindex' not in str(req_url))): #'pageindex=1' in str(req_url).lower()) or
yield scrapy.FormRequest(
url=urls,
callback=self.parse,
@ -269,4 +269,4 @@ class ChacewangSpider(scrapy.Spider):
sub = uuid[i * 4: i * 4 + 4]
x = int(sub, 16)
result += uuidChars[x % 0x3E]
return result
return result
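Only the tail of the short-ID helper is visible in this hunk. Below is a plausible reconstruction of the whole routine, assuming it compresses a uuid4 hex string into 8 characters over a 62-character alphabet, which is what the uuidChars[x % 0x3E] indexing implies; names other than uuidChars, result, sub and x are illustrative.
# Assumed reconstruction of the helper whose last lines appear above.
import uuid as uuid_module

uuidChars = ("abcdefghijklmnopqrstuvwxyz"
             "0123456789"
             "ABCDEFGHIJKLMNOPQRSTUVWXYZ")  # 62 symbols, hence the modulo 0x3E


def short_uuid():
    uuid = uuid_module.uuid4().hex   # 32 hex characters
    result = ''
    for i in range(8):               # 8 groups of 4 hex digits
        sub = uuid[i * 4: i * 4 + 4]
        x = int(sub, 16)
        result += uuidChars[x % 0x3E]
    return result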

83
demo1/spiders/fagaiweiSpider.py

@ -0,0 +1,83 @@
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
class fagaiweiSpider(scrapy.Spider,Util_WANG):
name='fagaiweiSpider'
settings = get_project_settings()
allowed_domains = ['ndrc.gov.cn']
custom_settings = custom_settings_conf_fagaiwei
start_urls=['https://www.ndrc.gov.cn/xxgk/zcfb/tz/index.html']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//*[@class="list"]/ul/li[not(@class="empty")]')
for lis_sigl in lis:
item=Shouyelianjie()
item['biaoti']=lis_sigl.xpath('./a/@title').extract_first()
item['shijian']=lis_sigl.xpath('.//span/text()').extract_first().replace('/','-')
item['lianjie']='https://www.ndrc.gov.cn/xxgk/zcfb/tz'+lis_sigl.xpath('.//a/@href').extract_first().strip('.')
item['laiyuan']='发改委'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
next_page = response.xpath('//*[@class="page"]//script').re('\d{1,2}.*?,.*?\d{1,2}')[0].split(',')
count_page = int(next_page[0].strip())
curry_page = int(next_page[-1].strip())+1
if curry_page < count_page:
urls = 'https://www.ndrc.gov.cn/xxgk/zcfb/tz/index_' + str(curry_page) + '.html'
yield scrapy.Request(url=urls, callback=self.parse)
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
# for requesting a detail page directly during testing
# urls='https://www.ndrc.gov.cn/xxgk/zcfb/tz/202004/t20200414_1225669.html'
# yield scrapy.Request(url=urls,callback=self.page_url,meta={'item':Shouyelianjie()})
def page_url(self,response):
item=response.meta['item']
txts=response.xpath('//*[@class="article_l"]/*[not(contains(@class,"shezhi"))]')
a=''
for txt in txts:
a+=txt.extract()
item['xiangqing']=a
self.tihuan_a_return(item, self.settings.get('FILE_PATH'),response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'),response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

112
demo1/spiders/gongyehexinxihuabuSpider.py

@ -0,0 +1,112 @@
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
import json
import time
from urllib import parse
from demo1.Util import Util_WANG
class gongyehexinxihuabuSpider(scrapy.Spider,Util_WANG):
name = 'gongyehexinxihuabuSpider'
settings = get_project_settings()
allowed_domains = ['miit.gov.cn']
custom_settings = custom_settings_conf_gongyehexinxihuabu
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def start_requests(self):
url='http://www.miit.gov.cn/gdnps/wjfbindex.jsp'
yield scrapy.Request(url=url,callback=self.dierci_requests)
def dierci_requests(self,response):
url = self.return_url()
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
txt=response.text
txt=txt[txt.find('{'):txt.rfind('}')+1]
txt=json.loads(txt)
curPage = txt['curPage']
totalpagenum = txt['totalPageNum']
for ac in txt['resultMap']:
item=Shouyelianjie()
item['biaoti']=ac['title']
res_time=datetime.datetime.strptime(ac['publishTime'],'%Y%m%d%H%M%S').strftime('%Y-%m-%d')
item['shijian']=res_time
buafter=''
try:
buafter=ac['fbjgmc']
except:
logging.info('没有这个字段')
item['laiyuan']='工信部'+buafter
chushi_url="http://www.miit.gov.cn"+ac['ownSubjectDn'].replace("/1/29/","/").replace('/',"/n")+"/c"+ac['id']+"/content.html"
item['lianjie']=chushi_url
item['xiangqing']=ac['htmlContent']
#html=etree.HTML(ac['htmlContent'])
#self.tihuan_a_return(item,ac['htmlContent'],self.settings.get('FILE_PATH'))
self.tihuan_a_return(item,self.settings.get('FILE_PATH'))
self.tihuan_img_return(item,self.settings.get('MESSAGE'))
if not self.settings.get('ISQUANPA'):
# resume crawl: only yield pages not yet stored
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
yield item
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
yield item
if self.settings.get('ISQUANPA'):
if curPage<totalpagenum:
yield scrapy.Request(url=self.return_url(curr_page=curPage+1),callback=self.parse)
def a_fun(self, href):
return href
def img_fun(self, src):
return src
def return_url(self,size=10,curr_page=1):
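# Build the MIIT search URL: the query is a JSON blob (page number, sort order, and an fbjg
# publishing-organ filter) that the endpoint expects double URL-encoded in the params query-string
# argument, followed by a jQuery-style JSONP callback name and a millisecond timestamp.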
start_url = 'http://www.miit.gov.cn/gdnps/searchIndex.jsp'
curr_time = time.time()
size = size
curr_page = curr_page
params = {
"goPage": curr_page,
"orderBy": [
{
"orderBy": "publishTime",
"reverse": "true"
},
{
"orderBy": "orderTime",
"reverse": "true"
}
],
"pageSize": size,
"queryParam": [
{},
{},
{
"shortName": "fbjg",
"value": "/1/29/1146295/1652858/1652930"
}
]
}
d = time.time()
d_int = int(round(d * 1000))
jquery = 'jQuery111108461701558527148_' + str(d_int)
params = json.dumps(params).replace(' ', '').replace('"true"', 'true')
url = start_url + "?params=" + parse.quote(parse.quote(params)).replace('/','%252F') + '&callback=' + jquery + '&_=' + str(d_int + 1)
return url
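# A minimal sketch (assuming the third-party requests package, run outside Scrapy) of fetching one page
# built by return_url() and decoding it the same way parse() does; `spider` stands for an instance of this class:
#   import json, requests
#   body = requests.get(spider.return_url(size=10, curr_page=1)).text
#   data = json.loads(body[body.find('{'):body.rfind('}') + 1])
#   print(data['curPage'], data['totalPageNum'], len(data['resultMap']))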

74
demo1/spiders/huojuzhongxinSpider.py

@ -0,0 +1,74 @@
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
class huojuzhognxinSpider(scrapy.Spider,Util_WANG):
name='huojuzhognxinSpider'
settings = get_project_settings()
allowed_domains = ['chinatorch.gov.cn']
custom_settings = custom_settings_conf_huojuzhognxin
start_urls=['http://www.chinatorch.gov.cn/kjb/tzgg/list.shtml']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//*[@class="list_con"]/li')
for li in lis:
item=Shouyelianjie()
item['shijian']=li.xpath('.//*[@class="list_time"]/text()').extract_first()
item['biaoti']=li.xpath('.//a/@title').extract_first()
item['laiyuan']='科技部火炬中心'
item['lianjie']='http://www.chinatorch.gov.cn'+li.xpath('.//a/@href').extract_first()
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
yield scrapy.Request(url=item['lianjie'],meta={'item':item},callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
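# The list page embeds its pager state in an inline <script>; the first number captured below is the
# total page count and the last is the current page, so the next page lives at list_<current+1>.shtml.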
next_page=response.xpath('//ul[@class="list_con"]/script[not(@src and @type)]').re(',.*?\d+.*?,.*?\d.*?,')[0].strip(',').split(',')
count_page=int(next_page[0].strip())
curry_page=int(next_page[-1].strip())
if curry_page<count_page:
urls='http://www.chinatorch.gov.cn/kjb/tzgg/list_'+str(curry_page+1)+'.shtml'
yield scrapy.Request(url=urls,callback=self.parse)
except Exception as e:
logging.info(e)
else:
logging.info('全部爬取完毕')
def page_url(self,response):
item=response.meta['item']
txt=response.xpath('//div[contains(@class,"pages_content") and contains(@id,"content")]').extract_first()
item['xiangqing']=txt.replace('\u3000','')
self.tihuan_a_return(item, self.settings.get('FILE_PATH'))
self.tihuan_img_return(item, self.settings.get('MESSAGE'))
yield item
def a_fun(self,href):
return 'http://www.chinatorch.gov.cn'+href
def img_fun(self, src):
return 'http://www.chinatorch.gov.cn'+src

136
demo1/spiders/kexujishubuSpider.py

@ -0,0 +1,136 @@
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
class kexujishubuSpider(CrawlSpider):
name = 'kexujishubuSpider'
settings=get_project_settings()
allowed_domains = ['most.gov.cn']
custom_settings =custom_settings_conf_kexujishubu
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
self.settings =get_project_settings()
def start_requests(self):
start_url='http://www.most.gov.cn/tztg/index.htm'
yield scrapy.Request(url=start_url, callback=self.parse)
# For testing a single detail page:
# ceshiwenzhang='http://www.most.gov.cn/tztg/201901/t20190107_144549.htm'
# item = Shouyelianjie()
# item['biaoti']='国家遥感中心2018年面向社会公开招聘拟聘用人员公示'
# item['lianjie']=ceshiwenzhang
# item['shijian']='2019-01-04'
# yield scrapy.Request(url=ceshiwenzhang,callback=self.parse_url,meta={'item':item})
def parse(self, response):
text=response.text
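# Paging state on most.gov.cn is exposed as inline JS variables (var currentPage = ..., var countPage = ...);
# both are extracted with regexes below.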
panDuanNone = lambda x: '_' if x is None else x
currentPage_var=re.search('var.*?currentPage.*?=.*?\d+',text).group(0)
currentPage = int(currentPage_var[currentPage_var.find('=') + 1:].strip())
countPage_var=re.search('var.*?countPage.*?=.*?\d+',text).group(0)
countPage=int(countPage_var[countPage_var.find('=')+1:].strip())
tables=response.xpath('//td[@class="STYLE30"]')
for table in tables :
item = Shouyelianjie()
item['biaoti']=table.xpath('.//a/text()').extract_first()
item['lianjie']='http://www.most.gov.cn/tztg'+table.xpath('.//a/@href').extract_first().strip('.')
item['shijian']=re.findall('(\d{4}-\d{1,2}-\d{1,2})',table.xpath('string(.)').extract_first())[-1]
if not self.settings.get('ISQUANPA'):
#Incremental crawl: only request links not yet stored in the database
self.cursor.execute('select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res==0:
yield scrapy.FormRequest(url=item['lianjie'],
meta={'item': item},
callback=self.parse_url,
method='GET'
)
else:
logging.info('这个链接已经爬过了-----:'+item['lianjie'])
else:
#Full crawl: request every detail page
yield scrapy.FormRequest(url=item['lianjie'],
meta={'item': item},
callback=self.parse_url,
method='GET'
)
if self.settings.get('ISQUANPA'):
#Full crawl
if currentPage+1<countPage:
new_url='http://www.most.gov.cn/tztg/index_'+str(currentPage+1)+'.htm'
yield scrapy.FormRequest(url=new_url,
callback=self.parse
)
#No incremental pagination is needed here: this listing updates slowly, so checking the first page is enough
def parse_url(self,response):
year = datetime.datetime.now().strftime('%Y')
mouth = datetime.datetime.now().strftime('%m')
current_url=response.url
item=response.meta['item']
item['laiyuan']='科技部'
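# On most.gov.cn the article body sits between <meta name="ContentStart"> and <meta name="ContentEnd">;
# the raw HTML between those two markers is sliced out below, and attachment links inside #Zoom are
# rewritten to local /<year>/<month>/<short-id>_<name> paths.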
#() | (//meta[@name="ContentEnd"]/preceding-sibling::*)
a1=response.xpath('//meta[@name="ContentStart"]/following-sibling::*')
for a_i,a_value in enumerate(a1):
c=a_value.xpath('.//@name')
if len(c)>0 and (str(c.extract_first()).lower()=="ContentEnd".lower()):
b=a_i
a2=a1[0:a_i-1:]
a_suoyou=response.xpath('//*[@id="Zoom"]//a[@href and (' + self.jiewei_contains() + ')]')
als = response.xpath('//*[@id="Zoom"]')[0].re('<meta.*name="ContentStart".*[\s\S]*<meta.*name="ContentEnd">')[0]
als = str(als)
txt = als[als.find('name="ContentStart"') + len('name="ContentStart">'):als.rfind('<meta')]
for a_suoyou_i,a_suoyou_value in enumerate(a_suoyou):
single_a_file={}
single_a_file['file_name']=a_suoyou_value.xpath('string(.)').extract_first()
old_url=a_suoyou_value.xpath('@href').extract_first()
single_a_file['file_url']=current_url[0:current_url.rfind('/')]+old_url.strip('.')
houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
new_url = '/' + year + '/' + mouth + '/' + self.short_uuid() + '_' + houzui
txt=txt.replace(old_url,self.settings.get('FILE_PATH')+new_url)
single_a_file['new_file']=new_url
try:
item['wenjian'].append( single_a_file)
except:
item['wenjian'] =[single_a_file]
item['xiangqing']=txt.strip('\n').strip().replace('\u3000',' ').replace('\xa0',' ')
#context_all= etree.HTML(response.text).xpath('//meta[@name="ContentStart"]/following-sibling::*[name(.)!="table" and name(.)!="meta"]')
yield item
def jiewei_contains(self):
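# Build an XPath predicate of the form contains(@href,".doc") or contains(@href,".xls") or ... so that
# only links pointing at downloadable attachments are selected.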
str = ''
jiewei = ['.doc', '.xls', '.docx', '.xlsx','.txt','.rar','.zip']
for j in jiewei:
str += 'contains(@href,"' + j + '")' + ' or '
str = str.strip().strip('or').strip()
return str
def short_uuid(self):
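# Derive a short 8-character id: uuid4's 32 hex digits are split into eight 4-digit chunks and each chunk,
# taken mod 62, indexes into [a-z0-9A-Z]; used to keep stored attachment filenames unique.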
uuidChars = ("a", "b", "c", "d", "e", "f",
"g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
"t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5",
"6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
"W", "X", "Y", "Z")
uuid = str(uuid4()).replace('-', '')
result = ''
for i in range(0, 8):
sub = uuid[i * 4: i * 4 + 4]
x = int(sub, 16)
result += uuidChars[x % 0x3E]
return result

74
demo1/spiders/qicetongSpider.py

@ -0,0 +1,74 @@
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
class qicetong(scrapy.Spider,Util_WANG):
name = 'qicetongspider'
settings = get_project_settings()
allowed_domains = ['easyshb.com']
custom_settings = custom_settings_conf_qicetongSpider
start_urls=['http://www.easyshb.com/alias/zck.htm?categoryId=&name=&areaLevel=&cityId=&noticeType=M7EkvSokQa3QVgX6WFf5LP&policyType=']
#start_urls=['http://www.easyshb.com/alias/zck.htm?categoryId=&name=&pageNumber=76&areaLevel=&cityId=&noticeType=M7EkvSokQa3QVgX6WFf5LP&policyType=']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
a_lis=response.xpath('//div[contains(@class,"list_flex")]/a')
for a_lis_sign in a_lis:
item=Shouyelianjie()
item['diqu']=1000008
item['biaoti']=a_lis_sign.xpath('.//h3/text()').extract_first()
item['shijian']=a_lis_sign.xpath('.//p[@class="f14 fc-gray mt-5"]/text()').extract_first()
item['jianjie']=a_lis_sign.xpath('.//p[@class="ellipsis3 fc-gray mt-10"]//text()').extract_first()
item['laiyuan']='来源未知'
item['lianjie']=response.urljoin(a_lis_sign.xpath('./@href').extract_first())
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
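# Pagination is driven by the laypage widget configured in an inline <script>; the regex below pulls its
# pages/curr values, and the next page is requested via the pageNumber query parameter.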
next_page_num = response.xpath('//script[contains(text(),"laypage")]').re("pages.*?[\s\S]*?curr.*?\"\d*\"?")[0]
s=re.findall('\d+',next_page_num)
pages=int(s[0])
curr=int(s[1])
if pages>curr:
next_page='http://www.easyshb.com/alias/zck.htm?categoryId=&name=&pageNumber='+str(curr+1)+'&areaLevel=&cityId=&noticeType=M7EkvSokQa3QVgX6WFf5LP&policyType='
yield scrapy.Request(url=next_page, callback=self.parse)
except:
logging.info('全部爬取完毕')
def page_url(self,response):
item=response.meta['item']
item['xiangqing'] = response.xpath('//div[contains(@class,"content")]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

72
demo1/spiders/shanxifagaiwei.py

@ -0,0 +1,72 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#山西省发改委 (Shanxi Provincial Development and Reform Commission)
class sxfagaiweiSpider(scrapy.Spider,Util_WANG):
name = 'sxfagaiweiSpider'
settings = get_project_settings()
allowed_domains = ['fgw.shanxi.gov.cn']
custom_settings = custom_settings_conf_sxfagaiweiSpider
start_urls = ['http://fgw.shanxi.gov.cn/xmsb/']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//div[@class="list-block-wrap"]//ul[contains(@class,"content")]/li')
for li in lis:
item=Shouyelianjie()
item['laiyuan']='山西省发改委'
item['lianjie']=response.urljoin(li.xpath('.//a/@href').extract_first())
item['biaoti']=li.xpath('.//a/text()').extract_first()
item['shijian']=li.xpath('.//em/text()').extract_first()
#For testing:
#item['lianjie']='http://fgw.shanxi.gov.cn/fggz/wngz/wzzs/202006/t20200604_124281.shtml'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
pass
def page_url(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//div[@class="TRS_Editor"]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

89
demo1/spiders/shanxigongxintingSpider.py

@ -0,0 +1,89 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#山西省工业和信息厅 (Shanxi Provincial Department of Industry and Information Technology)
class sxgongxintingSpider(scrapy.Spider,Util_WANG):
name = 'sxgongxintingSpider'
settings = get_project_settings()
allowed_domains = ['gxt.shanxi.gov.cn']
custom_settings = custom_settings_conf_sxgongxintingSpider
start_urls = ['http://gxt.shanxi.gov.cn/web/cateList.html?id=26&word=&pageIndex=1']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//*[@class="zwgk-ul"]/li[not(@style)]')
for li in lis:
item=Shouyelianjie()
item['laiyuan']='山西省工业和信息厅'
item['lianjie']=response.urljoin( li.xpath('./a/@href').extract_first())
shijian=li.xpath('./i/text()').extract_first()
item['shijian']=datetime.datetime.strptime(shijian,'%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
item['biaoti']=li.xpath('./a/text()').extract_first()
#For testing:
#item['lianjie']='http://gxt.shanxi.gov.cn/web/content.html?id=1511'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
next_page=response.urljoin(response.xpath('//a[contains(text(),"下一页")]/@href').extract_first())
# if curry_page < count_page:
# urls = 'https://www.mct.gov.cn/whzx/ggtz/index_' + str(curry_page) + '.htm'
# yield scrapy.Request(url=urls, callback=self.parse)
current_url=response.url
if next_page!=current_url:
yield scrapy.Request(url=next_page, callback=self.parse)
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
def page_url(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//*[contains(@class,"textbody")]').extract_first()
wenjiande=response.xpath('//*[@class="attachment"]').extract_first()
if wenjiande is not None:
item['xiangqing']+=wenjiande
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

96
demo1/spiders/shanxishengkejitingSpider.py

@ -0,0 +1,96 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#山西省科技厅 (Shanxi Provincial Department of Science and Technology)
class kejitingSpider(scrapy.Spider,Util_WANG):
name = 'shanxishengkejitingSpider'
settings = get_project_settings()
allowed_domains = ['kjt.shanxi.gov.cn']
custom_settings = custom_settings_conf_sxkejitingSpider
start_urls = ['http://kjt.shanxi.gov.cn/tzgg/index.jhtml']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//*[@align="center"]//tr[not(@class)]')
year = datetime.datetime.now().strftime('%Y')
mouth = datetime.datetime.now().strftime('%m')
for li in lis:
item=Shouyelianjie()
item['lianjie']=response.urljoin(li.xpath('.//a/@href').extract_first())
item['shijian']=li.xpath('.//td')[-3].xpath('./text()').extract_first().replace('.','-')
item['biaoti']=li.xpath('.//a/text()').extract_first()
item['laiyuan']='山西省科技厅'+li.xpath('.//td')[-2].xpath('./text()').extract_first()
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if not (item['lianjie'].endswith('.jhtml') or item['lianjie'].endswith('.html')):
item['wenjian']=[{'file_name':'原文件'}]
item['wenjian'][0]['file_url']=item['lianjie']
houzui = item['wenjian'][0]['file_url'][item['wenjian'][0]['file_url'].rfind('/') + 1:]
new_url = '/' + year + '/' + mouth + '/' + self.short_uuid() + '_' + houzui
item['wenjian'][0]['new_file']=new_url
item['xiangqing'] = '<div><p>请查看原文附件:<a href="' +self.settings.get('FILE_PATH')+new_url +'">原文件</a></p></div>'
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_item)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if not (item['lianjie'].endswith('.jhtml') or item['lianjie'].endswith('.html')):
item['wenjian'] = [{'file_name': '原文件'}]
item['wenjian'][0]['file_url'] = item['lianjie']
houzui = item['wenjian'][0]['file_url'][item['wenjian'][0]['file_url'].rfind('/') + 1:]
new_url = '/' + year + '/' + mouth + '/' + self.short_uuid() + '_' + houzui
item['wenjian'][0]['new_file'] = new_url
item['xiangqing'] = '<div><p>请查看原文附件:<a href="' + self.settings.get(
'FILE_PATH') + new_url + '">原文件</a></p></div>'
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_item)
if self.settings.get("ISQUANPA"):
try:
next_page = response.xpath('//a[text()="下一页"]/@href').extract_first()
if next_page is not None:
yield scrapy.Request(url='http://kjt.shanxi.gov.cn/tzgg/' + next_page,
callback=self.parse)
else:
logging.info('所有的结束')
except:
logging.info('全部爬取完毕')
def page_item(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//*[@id="zoom"]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

183
demo1/spiders/shanxishengshangwutingSpider.py

@ -0,0 +1,183 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#山西省商务厅 (Shanxi Provincial Department of Commerce)
class sxsshangwutingSpider(scrapy.Spider,Util_WANG):
name = 'sxsshangwutingSpider'
settings = get_project_settings()
allowed_domains = ['swt.shanxi.gov.cn']
custom_settings = custom_settings_conf_sxShangwutingSpider
start_urls = ['http://swt.shanxi.gov.cn/Main/list.action?channelId=27']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
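# The first listing page is static HTML; pageSize and pageCount are read from the inline listTable script
# so the remaining pages can be fetched from the ajax list.action endpoint and handled in page_next_url().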
pages=response.xpath('//*[@class="pgTotalPage"]/text()').extract_first()
script=response.xpath('//script[contains(text(),"listTable.filter.channelId")]/text()').extract_first().lower()
size=re.search('pagesize.*?=.*?\d+',script).group().replace('pagesize','').replace('=','').strip()
pageCount=re.search('pagecount.*?=.*?\d+',script).group().replace(r'pagecount','').replace('=','').strip()
lis=response.xpath('//*[@class="t_text"]//li')
for li in lis:
item=Shouyelianjie()
item['lianjie']=response.urljoin(li.xpath('.//a/@href').extract_first())
item['laiyuan']='山西省商务厅'
item['biaoti']=li.xpath('.//a/@title').extract_first()
item['shijian']=li.xpath('.//span/text()').extract_first()
# Test link (for debugging):
#item['lianjie'] = 'http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=369d459b-a799-4e8a-87b7-8cd6c5cfc371'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
pageCount=int(pageCount)
for page_next in range(2,pageCount+1):
url_next='http://swt.shanxi.gov.cn/Main/list.action?ajax=true&pageCount='+str(pageCount)+'&pageSize='+size+'&page='+str(page_next)+'&channelId=27'
yield scrapy.Request(url=url_next, callback=self.page_next_url)
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
def page_next_url(self,response):
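# Ajax listing pages come back as JSON whose 'content' field is an HTML fragment; parse that fragment
# with lxml and extract the same fields as parse().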
context_json=json.loads(response.text)
context=context_json['content']
context_html=etree.HTML(context)
lis=context_html.xpath('//ul/li')
for li in lis:
item=Shouyelianjie()
item['lianjie']=response.urljoin(li.xpath('.//a/@href')[0])
item['laiyuan']='山西省商务厅'
item['biaoti']=li.xpath('.//a/@title')[0]
item['shijian']=li.xpath('.//span/text()')[0]
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
def page_url(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//div[@id="zoom"]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
#This site needs a dedicated attachment-link rewrite
self.dingzhi_tihuan_a(item,self.settings.get('FILE_PATH'),response)
yield item
def dingzhi_tihuan_a(self,item, tihuanlujing,response=None):
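# Site-specific attachment handling: hrefs of the form getFile.action?fileId=... carry no filename, so each
# link is rewritten to FILE_PATH plus a generated /<year>/<month>/<short-id>_<fileId>. name, and the
# original/new pair is recorded in item['wenjian'].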
txt = item['xiangqing']
year = datetime.datetime.now().strftime('%Y')
mouth = datetime.datetime.now().strftime('%m')
panDuanNone = lambda x: '_' if x is None else x
html = etree.HTML(txt)
alis = html.xpath('//a[@href and contains(@href,"getFile.action?fileId")]')
for alis_single in alis:
single_a_file = {}
href = str(alis_single.xpath('@href')[0])
content = str(panDuanNone(alis_single.xpath('string(.)')))
if content.strip() == '':
content = '_'
single_a_file['file_name'] = content
# Only this part changes per site: resolving the actual download URL
old_url = href
if href.lower().startswith('http'):
single_a_file['file_url'] = old_url
elif response != None and (old_url.lower().startswith('./') or old_url.lower().startswith('../')):
single_a_file['file_url'] = response.urljoin(old_url)
elif response != None and old_url.lower().startswith('/'):
single_a_file['file_url'] = response.urljoin(old_url)
#houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
houzui=single_a_file['file_url'][single_a_file['file_url'].rfind('=') + 1:]
new_url = '/' + year + '/' + mouth + '/' + self.short_uuid() +'_' + houzui+'.'
txt = txt.replace(old_url, tihuanlujing + new_url)
single_a_file['new_file'] = new_url
try:
item['wenjian'].append(single_a_file)
except:
item['wenjian'] = [single_a_file]
item['xiangqing'] = txt
def a_fun(self,href):
pass
def img_fun(self, src):
pass
def return_url(self, size=10, curr_page=1):
start_url = 'http://www.miit.gov.cn/gdnps/searchIndex.jsp'
curr_time = time.time()
size = size
curr_page = curr_page
params = {
"goPage": curr_page,
"orderBy": [
{
"orderBy": "publishTime",
"reverse": "true"
},
{
"orderBy": "orderTime",
"reverse": "true"
}
],
"pageSize": size,
"queryParam": [
{},
{},
{
"shortName": "fbjg",
"value": "/1/29/1146295/1652858/1652930"
}
]
}
d = time.time()
d_int = int(round(d * 1000))
jquery = 'jQuery111108461701558527148_' + str(d_int)
params = json.dumps(params).replace(' ', '').replace('"true"', 'true')
url = start_url + "?params=" + parse.quote(parse.quote(params)).replace('/',
'%252F') + '&callback=' + jquery + '&_=' + str(
d_int + 1)
return url

80
demo1/spiders/shanxixiaoqiyecujinjuSpider.py

@ -0,0 +1,80 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#山西省中小企业发展促进局 (Shanxi Provincial SME Development Promotion Bureau)
class cujinjuSpider(scrapy.Spider,Util_WANG):
name = 'cujinjuSpider'
settings = get_project_settings()
allowed_domains = ['xqyj.shanxi.gov.cn']
custom_settings = custom_settings_conf_cujinjuSpider
start_urls = ['http://xqyj.shanxi.gov.cn/v2/html/tzgg/index.html']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//*[@class="page_list"]//li')
for li in lis:
item=Shouyelianjie()
item['biaoti']=li.xpath('./a/@title').extract_first()
item['lianjie']=response.urljoin(li.xpath('./a/@href').extract_first())
item['laiyuan']='山西省小企业促进局'
item['shijian']=li.xpath('./sapn/text()').extract_first()
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
yield scrapy.Request(url=item['lianjie'],meta={'item':item},callback=self.page_item)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_item)
if self.settings.get("ISQUANPA"):
try:
next_page=response.xpath('//a[@class="next-page"]/@onclick').re('\'index.*')
if len(next_page)>0:
next_page[0]=next_page[0].strip('\'')
yield scrapy.Request(url='http://xqyj.shanxi.gov.cn/v2/html/tzgg/'+next_page[0],callback=self.parse)
except:
logging.info('全部爬取完毕')
def page_item(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//*[@class="doc_content"]').extract_first().replace('192.168.143.1','xqyj.shanxi.gov.cn')
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
# if item.get('wenjian') is not None:
# for wenjians in item['wenjian'][:]:
# if '_' in wenjians['file_name']:
# self.cursor.execute(
# 'select count(file_url) as nums FROM t_policy_file_crawl where file_url ="{}"'.format(wenjians['file_url']))
# res = self.cursor.fetchall()[0].get('nums')
# if res != 0:
# item['file_name'].remove(wenjians)
# logging.info(item)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

86
demo1/spiders/shanxizonggaiquSpider.py

@ -0,0 +1,86 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#山西省综改示范区 (Shanxi Transformation and Comprehensive Reform Demonstration Zone)
class sxszonggaishifanquSpider(scrapy.Spider,Util_WANG):
name = 'sxzonggaishifanquSpider'
settings = get_project_settings()
allowed_domains = ['zgq.shanxi.gov.cn']
custom_settings = custom_settings_conf_sxzonggaishifanSpider
start_urls = ['https://zgq.shanxi.gov.cn/?pcyear=8-10']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
alis=response.xpath('//div[@class="bszn"]/a')
for a_sign in alis:
item=Shouyelianjie()
item['shijian']=a_sign.xpath('.//font/text()').extract_first()
item['biaoti']=a_sign.xpath('.//span/text()').extract_first()
item['lianjie']=response.urljoin(a_sign.xpath('./@href').extract_first())
item['laiyuan']='山西转型综合改革示范区管理委员会'
#For testing:
#item['lianjie']='https://zgq.shanxi.gov.cn/?pcyear=8-10&id=7203'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
next_page=response.xpath('//div[@class="page"]//a[@href and contains(text(),">") and not(contains(text(),">>"))]/@href').extract_first()
if next_page is not None:
yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
# if curry_page < count_page:
# urls = 'https://www.mct.gov.cn/whzx/ggtz/index_' + str(curry_page) + '.htm'
# yield scrapy.Request(url=urls, callback=self.parse)
else:
logging.info('最后一页了。。。。')
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
def page_url(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//*[@class="newscontent"]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

97
demo1/spiders/taiyuangongyehexinxihuajuSpider.py

@ -0,0 +1,97 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#太原市工业和信息化局 (Taiyuan Municipal Bureau of Industry and Information Technology)
class taiyuangongyehexinxihuajuSpider(scrapy.Spider,Util_WANG):
name = 'taiyuangongyehexinxihuajuSpider'
settings = get_project_settings()
allowed_domains = ['jxw.taiyuan.gov.cn']
custom_settings = custom_settings_conf_taiyuangongyehexinxihuajuSpider
start_urls = ['http://jxw.taiyuan.gov.cn/zfxxgk/gggs/index.shtml']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis = response.xpath('//ul[@class="List_list"]/li')
for li in lis:
item = Shouyelianjie()
item['lianjie'] = response.urljoin(li.xpath('.//a/@href').extract_first())
item['biaoti'] = li.xpath('.//a/@title').extract_first()
if item['biaoti'] is None:
item['biaoti'] = li.xpath('.//a/text()').extract_first()
item['shijian'] = li.xpath('.//span/text()').extract_first()
item['laiyuan'] = '太原市工业和信息化局'
# For testing:
#item['lianjie']='http://jxw.taiyuan.gov.cn/doc/2020/03/27/965251.shtml'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
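# The #pages element embeds a dict-like paging blob (containing "pageIndex", "pageCount" and "pageSize");
# it is pulled out with a regex and evaluated to decide whether another index_<n>.shtml page should be requested.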
nums = response.xpath('//*[@id="pages"]').re('{.*?pageIndex\":.*?pageCount\":.*?pageSize\":.*?}')[0]
nums = eval(nums)
pageIndex = int(nums['pageIndex'])
pageCount = int(nums['pageCount'])
next_page = pageIndex + 1
# if curry_page < count_page:
# urls = 'https://www.mct.gov.cn/whzx/ggtz/index_' + str(curry_page) + '.htm'
# yield scrapy.Request(url=urls, callback=self.parse)
if next_page <= pageCount:
next_url = 'http://jxw.taiyuan.gov.cn/zfxxgk/gggs/index_' + str(next_page) + '.shtml'
yield scrapy.Request(url=next_url, callback=self.parse)
else:
logging.info('全部爬完了')
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
def page_url(self, response):
item = response.meta['item']
item['xiangqing'] = response.xpath('//*[@id="Zoom"]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

97
demo1/spiders/taiyuanshangwujuSpider.py

@ -0,0 +1,97 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#太原市商务局 (Taiyuan Municipal Bureau of Commerce)
class taiyuangongshangwujuSpider(scrapy.Spider,Util_WANG):
name = 'taiyuangongshangwujuSpider'
settings = get_project_settings()
allowed_domains = ['sswj.taiyuan.gov.cn']
custom_settings = custom_settings_conf_taiyuanshangwujuSpider
start_urls = ['http://sswj.taiyuan.gov.cn/zfxxgk/tzgg/index.shtml']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis = response.xpath('//ul[@class="List_list"]/li')
for li in lis:
item = Shouyelianjie()
item['lianjie'] = response.urljoin(li.xpath('.//a/@href').extract_first())
item['biaoti'] = li.xpath('.//a/@title').extract_first()
if item['biaoti'] is None:
item['biaoti'] = li.xpath('.//a/text()').extract_first()
item['shijian'] = li.xpath('.//span/text()').extract_first()
item['laiyuan'] = '太原市商务局'
# For testing:
#item['lianjie']='http://jxw.taiyuan.gov.cn/doc/2020/03/27/965251.shtml'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
nums = response.xpath('//*[@id="pages"]').re('{.*?pageIndex\":.*?pageCount\":.*?pageSize\":.*?}')[0]
nums = eval(nums)
pageIndex = int(nums['pageIndex'])
pageCount = int(nums['pageCount'])
next_page = pageIndex + 1
# if curry_page < count_page:
# urls = 'https://www.mct.gov.cn/whzx/ggtz/index_' + str(curry_page) + '.htm'
# yield scrapy.Request(url=urls, callback=self.parse)
if next_page <= pageCount:
next_url = 'http://sswj.taiyuan.gov.cn/zfxxgk/tzgg/index_' + str(next_page) + '.shtml'
yield scrapy.Request(url=next_url, callback=self.parse)
else:
logging.info('全部爬完了')
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
def page_url(self, response):
item = response.meta['item']
item['xiangqing'] = response.xpath('//*[@id="Zoom"]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

92
demo1/spiders/taiyuanshikexujishujuSpider.py

@ -0,0 +1,92 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
#太原市科学技术局 (Taiyuan Municipal Bureau of Science and Technology)
class taiyuankexuejishujuSpider(scrapy.Spider,Util_WANG):
name = 'taiyuankexuejishujuSpider'
settings = get_project_settings()
allowed_domains = ['kjj.taiyuan.gov.cn']
custom_settings = custom_settings_conf_taiyuankexuejishujuSpider
start_urls = ['http://kjj.taiyuan.gov.cn/zfxxgk/gggs/index.shtml']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//ul[@class="List_list"]/li')
for li in lis:
item=Shouyelianjie()
item['lianjie']=response.urljoin(li.xpath('.//a/@href').extract_first())
item['biaoti']=li.xpath('.//a/text()').extract_first()
item['shijian']=li.xpath('.//span/text()').extract_first()
item['laiyuan']='太原市科学技术局'
#For testing:
#item['lianjie']='http://kjj.taiyuan.gov.cn/doc/2018/04/16/390076.shtml'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
nums=response.xpath('//*[@id="pages"]').re('{.*?pageIndex\":.*?pageCount\":.*?pageSize\":.*?}')[0]
nums=eval(nums)
pageIndex=int(nums['pageIndex'])
pageCount=int(nums['pageCount'])
next_page=pageIndex+1
# if curry_page < count_page:
# urls = 'https://www.mct.gov.cn/whzx/ggtz/index_' + str(curry_page) + '.htm'
# yield scrapy.Request(url=urls, callback=self.parse)
if next_page <= pageCount:
yield scrapy.Request(url='http://kjj.taiyuan.gov.cn/zfxxgk/gggs/index_'+str(next_page)+'.shtml', callback=self.parse)
else:
logging.info('全部爬完了')
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
def page_url(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//*[@id="Zoom"]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

86
demo1/spiders/wenhuahelvyoubuSpider.py

@ -0,0 +1,86 @@
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
class wenhuahelvyoubuSpider(scrapy.Spider,Util_WANG):
name='wenhuahelvyoubuSpider'
settings = get_project_settings()
allowed_domains = ['mct.gov.cn']
custom_settings = custom_settings_conf_wenhuahelvyoubu
start_urls=['https://www.mct.gov.cn/whzx/ggtz/index.htm']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
trs=response.css('table tr')
for tr in trs:
item=Shouyelianjie()
item['lianjie']=tr.css('td a::attr(href)').extract_first()
item['biaoti']=tr.css('td a::text').extract_first()
item['shijian']=tr.css('td[class$="time"]::text').extract_first()
item['laiyuan']='文化和旅游部'
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
if Util_WANG.pos_url(item,self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
if Util_WANG.pos_url(item, self.settings):
yield item
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
next_page = response.xpath('//body//*[contains(text(),"createPageHTML")]').re('\d{1,2}.*?,.*?\d{1,2}')[0].split(',')
count_page = int(next_page[0].strip())
curry_page = int(next_page[-1].strip()) + 1
if curry_page < count_page:
urls = 'https://www.mct.gov.cn/whzx/ggtz/index_' + str(curry_page) + '.htm'
yield scrapy.Request(url=urls, callback=self.parse)
except Exception as e:
logging.error(e)
logging.info('因为异常:全部爬取完毕')
else:
logging.info('全部爬取完毕')
# Directly test a single detail page:
# urls='http://zwgk.mcprc.gov.cn/auto255/201612/t20161206_30535.html'
# yield scrapy.Request(url=urls,callback=self.page_url,meta={'item':Shouyelianjie()})
def page_url(self,response):
item=response.meta['item']
item['xiangqing']=response.css('#ContentRegion').extract_first()
if item['xiangqing'] is None:
item['xiangqing']=response.css('#zoom').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

75
demo1/spiders/zhongxiaoqiyejuSpider.py

@ -0,0 +1,75 @@
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
import json
import time
from urllib import parse
from demo1.Util import Util_WANG
class zhongxiaoqiyejuSpider(scrapy.Spider,Util_WANG):
name = 'zhongxiaoqiyejuSpider'
settings = get_project_settings()
allowed_domains = ['miit.gov.cn']
custom_settings = custom_settings_conf_zhongxiaoqiyezongju
start_urls=['http://www.miit.gov.cn/n1146285/n1146352/n3054355/n3057527/n3057529/index.html']
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def parse(self, response):
lis=response.xpath('//*[@class="clist_con"]//li')
for li in lis:
item=Shouyelianjie()
item['biaoti']=li.xpath('./a/text()').extract_first()
item['lianjie'] =response.urljoin(li.xpath('./a/@href').extract_first())
item['laiyuan']='中小企业局'
item['shijian']=li.xpath('.//span//a/text()').extract_first()
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
yield scrapy.Request(url=item['lianjie'],meta={'item':item},callback=self.parse_item)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
yield scrapy.Request(url=item['lianjie'],meta={'item':item},callback=self.parse_item)
if self.settings.get("ISQUANPA"):
hrefs = response.xpath('//a[contains(@href,"index_") and contains(@href,"../") and contains(@href,".html")]/@href').extract()
for href in hrefs:
new_href=response.urljoin(href)
yield scrapy.Request(new_href,callback=self.parse_url)
def parse_url(self,response):
lis=response.xpath('//*[@class="clist_con"]//li')
for li in lis:
item=Shouyelianjie()
item['shijian']=li.xpath('./span/a/text()').extract_first()
item['laiyuan']='中小企业局'
item['lianjie']=response.urljoin(li.xpath('./a/@href').extract_first())
item['biaoti']=li.xpath('./a/text()').extract_first()
yield scrapy.Request(url=item['lianjie'],callback=self.parse_item,meta={'item':item})
def parse_item(self,response):
item=response.meta['item']
item['xiangqing']=response.xpath('//*[contains(@id,"con_con") and contains(@class,"center")]').extract_first()
self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
yield item
def a_fun(self,href):
pass
def img_fun(self, src):
pass

73
demo1/spiders/ziranweiyuanhuiSpider.py

@ -0,0 +1,73 @@
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
class ziranweiyuanhuiSpider(scrapy.Spider,Util_WANG):
name = 'ziranweiyuanhuiSpider'
settings = get_project_settings()
allowed_domains = ['nsfc.gov.cn']
custom_settings = custom_settings_conf_ziranweiyuanhui
def __init__(self, name=None, **kwargs):
self.db = pymysql.connect(
host=self.settings['MYSQL_HOST'],
database=self.settings['MYSQL_DATABASE'],
user=self.settings['MYSQL_USER'],
password=self.settings['MYSQL_PASSWORD'],
port=3306,
charset='utf8',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True
)
self.cursor = self.db.cursor()
def start_requests(self):
yield scrapy.Request(url=self.return_start_url(),callback=self.parse)
#Test page:
#yield scrapy.Request(url='http://www.nsfc.gov.cn/publish/portal0/tab442/info76617.htm',callback=self.page_url,meta={'item':Shouyelianjie()})
def return_start_url(self,page=1):
return 'http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page'+str(page)+'.htm'
def parse(self, response):
news=response.xpath('//*[@class="clearfix"]')
for news_list in news:
item=Shouyelianjie()
item['lianjie']='http://www.nsfc.gov.cn'+news_list.xpath('.//*[@class="fl"]/a/@href').extract_first()
item['laiyuan']='国家自然科学基金委员会'
item['shijian']=news_list.xpath('.//*[@class="fr"]/text()').extract_first()
item['biaoti']=news_list.xpath('.//a/text()').extract_first()
if not self.settings.get("ISQUANPA"):
self.cursor.execute(
'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
res = self.cursor.fetchall()[0].get('nums')
if res == 0:
yield scrapy.Request(url=item['lianjie'],meta={'item':item},callback=self.page_url)
else:
logging.info('这个链接已经爬过了-----:' + item['lianjie'])
else:
yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
if self.settings.get("ISQUANPA"):
try:
next_page='http://www.nsfc.gov.cn'+response.xpath('//a[@class="Normal"]')[-2].xpath('@href').extract_first()
yield scrapy.Request(url=next_page,callback=self.parse)
except:
logging.info('全部爬取完毕')
def page_url(self,response):
item=response.meta['item']
txt=response.xpath('//*[@class="content_xilan"]').extract_first()
item['xiangqing']=txt.replace('\u3000','')
self.tihuan_a_return(item,self.settings.get('FILE_PATH'))
self.tihuan_img_return(item,self.settings.get('MESSAGE'))
yield item
def a_fun(self,href):
return 'http://www.nsfc.gov.cn'+href
def img_fun(self, src):
return 'http://www.nsfc.gov.cn'+src

BIN
noall_demo1.egg

Binary file not shown.

3
scrapy.cfg

@ -1,7 +1,7 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
#
[settings]
default = demo1.settings
@ -9,3 +9,4 @@ default = demo1.settings
[deploy:demo1]
url = http://49.232.6.143:6800/
project = chacewang
#https://scrapyd.readthedocs.io/en/latest/deploy.html