简易抓取2018国家统计局的省市信息并导出到csv文件中(php)
简易抓取2018国家统计局的省市信息
注:代码的异常处理尚待完善,不喜勿喷,感谢!
header("Content-Type: text/html;charset=UTF-8");

// Disable the execution time limit: the script issues one HTTP request per province.
ini_set('max_execution_time', '0');

// Index page listing the provinces; province pages live next to it as "<code>.html".
$url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/';

$data = fetchUtf8Page($url);
if ($data === false) {
    exit('抓取失败: ' . $url);
}

preg_match_all('/provincetr\'>(.*?)<\/tr>/', $data, $matches);
// preg_match_all always fills $matches with (possibly empty) sub-arrays, so the
// capture group itself has to be inspected to detect "nothing matched".
if (empty($matches[1])) {
    exit('匹配异常');
}
$data2show = returnArr($matches[1]);

// Build the province map: two-digit code => province name.
$provinceArr = array();
foreach ($data2show as $val) {
    // Cells that do not contain a province link (e.g. padding cells) are skipped
    // instead of producing an empty-key entry.
    if (preg_match('/=\'(\d{2})\.html/', $val, $sz) === 1
        && preg_match('/\'>(.{1,30})<br/', $val, $hz) === 1
    ) {
        $provinceArr[$sz[1]] = $hz[1];
    }
}
if (empty($provinceArr)) {
    exit('匹配异常');
}

// Collect the city rows of every province.
$city = array();
foreach ($provinceArr as $key => $val) {
    // Province page address: base URL + province code + ".html".
    $cityUrl = $url . $key . '.html';
    $data = fetchUtf8Page($cityUrl);
    if ($data === false) {
        // Best effort, as before: a province whose page cannot be fetched is left
        // out, but the failure is now logged instead of passing unnoticed.
        error_log('fetch failed: ' . $cityUrl);
        continue;
    }

    preg_match_all('/citytr\'>(.*?)<\/tr>/', $data, $matches);
    foreach ($matches[1] as $k => $v) {
        preg_match_all('/=\'(\d{2})\/(\d{4})\.html\'>(.*?)<\/a>/', $v, $info);
        // The row is read at match index 1, i.e. the second link of the row
        // (presumably the first link shows the numeric code and the second the
        // name - confirm against the live page). Rows without it are skipped.
        if (!isset($info[3][1])) {
            continue;
        }
        $city[$key][$k] = array(
            'province_code' => $info[1][1],
            'province_name' => $val,
            'city_code'     => $info[2][1],
            // A row named "市辖区" is reported under the province name instead.
            'city_name'     => ($info[3][1] === '市辖区') ? $val : $info[3][1],
        );
    }
}

$cityArr = arr2ToArr1($city);
if (empty($cityArr)) {
    exit('匹配异常');
}
export_csv($cityArr);
exit;

/**
 * Fetch a page with curlGet() and convert it from GBK to UTF-8.
 *
 * @param string $url Address to fetch.
 * @return string|false UTF-8 page content, or false when the request failed,
 *                      returned an empty body, or could not be converted.
 */
function fetchUtf8Page($url)
{
    $raw = curlGet($url);
    if ($raw === false || $raw === '') {
        return false;
    }

    return iconv("GBK", "UTF-8//IGNORE", $raw);
}

/**
 * Perform an HTTP GET request with cURL.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function curlGet($url)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    // Return the response as a string instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    // The script runs without an execution time limit, so a stalled connection
    // would otherwise block it forever.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);

    $data = curl_exec($curl);
    curl_close($curl);

    return $data;
}

/**
 * Split table rows into their cells and return all cells as one flat list.
 *
 * @param array $content List of row HTML fragments (the inside of each <tr>).
 * @return array Flat list of cell fragments, in document order.
 */
function returnArr($content)
{
    $arr = array();
    foreach ($content as $key => $val) {
        // Split on the literal "<td>" tag. trim($val, '<td>') treats its second
        // argument as a character mask and would also eat leading/trailing
        // "t", "d", "<" or ">" characters that belong to the cell content.
        $arr[$key] = preg_split('/<td>/', $val, -1, PREG_SPLIT_NO_EMPTY);
    }

    return arr2ToArr1($arr);
}

/**
 * Flatten a two-dimensional array by one level.
 *
 * @param array $arr Array of arrays.
 * @return array All inner arrays merged with array_merge (integer keys are renumbered).
 */
function arr2ToArr1($arr)
{
    return array_reduce($arr, 'array_merge', array());
}

/**
 * Write rows to a CSV file under <DOCUMENT_ROOT>/csv/<Y-m-d>/ and send that
 * file to the client as a download.
 *
 * @param array $data List of rows; each row is an array of field values.
 * @return void
 * @throws RuntimeException When the target directory or file cannot be created.
 */
function export_csv($data)
{
    $path = $_SERVER['DOCUMENT_ROOT'] . "/csv/" . date("Y-m-d", time()) . "/";
    // Create the dated directory on first use; the second is_dir() covers the
    // case where another request created it in the meantime.
    if (!is_dir($path) && !mkdir($path, 0777, true) && !is_dir($path)) {
        throw new RuntimeException('Unable to create directory: ' . $path);
    }

    $filename = $path . time() . '.csv';
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        throw new RuntimeException('Unable to open file for writing: ' . $filename);
    }

    // UTF-8 byte order mark: without it Excel shows garbled text, while WPS
    // and plain editors open the file correctly either way.
    fwrite($fp, chr(0xEF) . chr(0xBB) . chr(0xBF));
    foreach ($data as $fields) {
        fputcsv($fp, $fields);
    }
    fclose($fp);

    if (!headers_sent()) {
        header("Content-Type: text/csv;charset=utf-8");
        // Expose only the file name, not the absolute server path.
        header('Content-Disposition: attachment;filename="' . basename($filename) . '"');
        header("Pragma: no-cache");
        header("Expires: 0");
    }
    // The download headers promise a file, so the generated CSV is streamed as
    // the response body; otherwise the client receives an empty attachment.
    readfile($filename);
}
相关推荐
83911535 2020-11-13
曾是土木人 2020-10-31
yegen00 2020-10-21
soralaro 2020-10-11
katanaFlower 2020-09-18
wytzsjzly 2020-08-17
88407710 2020-08-17
ChinaJoeEE 2020-08-16
CyborgLin 2020-08-15
Blueberry 2020-08-15
PinkBean 2020-08-11
katanaFlower 2020-08-03
hunningtu 2020-07-30
阿债的方寸天地 2020-06-28
pingyan 2020-06-25
wytzsjzly 2020-06-25
阳光岛主 2020-06-25
阿债的方寸天地 2020-06-16