分享一段按关键字抓取 components.arrow.com 站点页面的 PHP 代码
代码如下:
<?php
/**
 * HOST: components.arrow.com
 */
//set_time_limit(0);

// base function

/**
 * Perform an HTTP GET request with cURL and return the raw response.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Query parameters; an array is encoded with
 *                              http_build_query(), a string is appended as-is.
 * @param array        $header  HTTP header lines passed to CURLOPT_HTTPHEADER.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT only;
 *                              no overall transfer timeout is set).
 * @param int          $port    Remote port passed to CURLOPT_PORT.
 * @param string       $reffer  Optional Referer header value.
 * @param string       $proxy   Optional proxy host; when non-empty the request is proxied.
 * @return array 'result' holds the curl_exec() return value (response body, or false on
 *               failure); 'error' is present only when the request failed.
 */
function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
{
    $ch = curl_init();
    if ($ch === false) {
        // Keep the same result shape as a failed transfer so callers' isset($r['error']) still works.
        return array('result' => false, 'error' => "Error:\n" . 'curl_init() failed');
    }

    if (!empty($data)) {
        $data = is_array($data) ? http_build_query($data) : $data;
        // strpos() returns 0 (falsy) for a match at offset 0, so compare against false explicitly.
        $url .= (strpos($url, '?') !== false ? '&' : '?') . $data;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 0);
    curl_setopt($ch, CURLOPT_PORT, $port);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects and fetch the final page

    if ($reffer) {
        curl_setopt($ch, CURLOPT_REFERER, $reffer);
    }

    if ($proxy) {
        // NOTE(review): proxy port and credentials are hard-coded here; they should be
        // moved to configuration rather than kept in source.
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
    }

    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "Error:\n" . curl_error($ch);
    }
    curl_close($ch);

    return $result;
}
代码如下:
/**
 * Perform an HTTP POST request with cURL and return the raw response.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Request body passed to CURLOPT_POSTFIELDS.
 * @param array        $header  HTTP header lines; only applied when non-empty.
 * @param int          $timeout Connect timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param int          $port    Remote port passed to CURLOPT_PORT.
 * @return array 'result' holds the curl_exec() return value (response body, or false on
 *               failure); 'error' is present only when the request failed.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 15, $port = 80)
{
    $ch = curl_init();
    if ($ch === false) {
        // Same shape as a failed transfer so callers' isset($r['error']) check still works.
        return array('result' => false, 'error' => "Error:\n" . 'curl_init() failed');
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_PORT, $port);
    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "Error:\n" . curl_error($ch);
    }
    curl_close($ch);

    return $result;
}

/**
 * Fetch the HTML source of one search-result (list) page.
 *
 * @param string $keywords Search keyword.
 * @param int    $start    Offset of the first record to request.
 * @return string|false HTML source, or false when $start is negative or the request failed.
 */
function getListHtml($keywords, $start = 0)
{
    if ($start < 0) {
        return false;
    }

    $postData = array(
        'search_token' => $keywords,
        'start'        => $start,
        'limit'        => 100,
    );

    // The keyword becomes a URL path segment, so it has to be percent-encoded
    // (spaces, '/', '?', '#' would otherwise produce a malformed or different URL).
    $url = 'http://components.arrow.com/part/search/' . rawurlencode($keywords);

    $result = curl_post($url, http_build_query($postData));
    if (isset($result['error'])) {
        return false;
    }

    return $result['result'];
}

/**
 * Extract the detail-page links from a list page.
 *
 * @param string $html HTML source of a list page.
 * @return array List of href values; empty when nothing matched.
 */
function getListHref($html)
{
    $pattern = '/<td\s+class="col_mfr_part_num"><a\s+href="(.[^>]+)">/isU';
    if (preg_match_all($pattern, $html, $matches)) {
        return $matches[1];
    }

    // no matches
    return array();
}

/**
 * Read the start offset of the next page from the buildPagination(...) script call.
 *
 * @param string $html HTML source of a list page.
 * @return int Third argument of buildPagination(), or -1 when the call is not found.
 */
function getListNextPage($html)
{
    $pattern = '/<script\s+language="javascript">buildPagination\(\'\d+\',\'\d+\',\'(\d+)\',\d+\);<\/script>/isU';
    if (preg_match($pattern, $html, $matches)) {
        return intval($matches[1]);
    }

    return -1;
}

/**
 * Collect the detail-page links from every list page of a search.
 *
 * @param string $keywords Search keyword.
 * @return array|false All hrefs found (possibly empty), or false when $keywords is empty.
 */
function getListHrefAll($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getListHtml($keywords);
    if ($html === false) {
        // first page could not be fetched: nothing to return
        return array();
    }

    $hrefList = getListHref($html);
    if (empty($hrefList)) {
        // no results
        return array();
    }

    // Follow pagination. Require the offset to advance on every step so a page that
    // keeps reporting the same "next" offset cannot trap us in an endless loop.
    $currentStart = 0;
    $nextPage = getListNextPage($html);
    while ($nextPage > $currentStart) {
        $currentStart = $nextPage;

        $html = getListHtml($keywords, $currentStart);
        if ($html === false) {
            break;
        }

        $hrefList = array_merge($hrefList, getListHref($html));
        $nextPage = getListNextPage($html);
    }

    return $hrefList;
}

/**
 * Fetch a part detail page and parse its fields.
 *
 * @param string $url Site-relative URL of the detail page (the host is prepended).
 * @return array|false Parsed fields; an empty array when the page could not be fetched or
 *                     has no "Part No" entry; false when $url is empty.
 *                     'para' and 'price' are arrays when data was found and '' otherwise.
 */
function getDetail($url)
{
    if (empty($url)) {
        return false;
    }

    $host = 'http://components.arrow.com';
    $url = $host . $url;

    $response = curl_get($url);
    if (isset($response['error'])) {
        return array();
    }
    $html = $response['result'];

    $result = array(
        'sup_part'   => '', // supplier part number
        'sup_id'     => '', // supplier ID
        'mfg_part'   => '', // manufacturer part number
        'mfg_name'   => '', // manufacturer name
        'cat_name'   => '', // category name
        'para'       => '', // attributes
        'desc'       => '', // description
        'pdf_url'    => '', // PDF URL
        'sup_stock'  => '', // stock
        'min_purch'  => '', // minimum order quantity
        'price'      => '', // price
        'img_url'    => '', // image URL
        'createtime' => '', // creation time
        'datacode'   => '', // date code
        'package'    => '', // package
        'page_url'   => '', // page URL
    );

    // mfg_part -- mandatory: without it the page is treated as unparsable.
    // (A leftover debug dump + die() used to abort the whole run here.)
    $pattern = '/<li>[\s\n]*<strong>Part No:\s*<\/strong>(.+)<\/li>/isU';
    if (!preg_match($pattern, $html, $matches)) {
        return array();
    }
    $result['mfg_part'] = trim($matches[1]);

    // mfg_name
    $pattern = '/<li>[\s\n]*<strong>Manufacturer: <\/strong>(.+)<\/li>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['mfg_name'] = trim($matches[1]);
    }

    // cat_name
    $pattern = '/displayCategory\(\'(.[^\']+)\'\);/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['cat_name'] = str_replace('|', '>', trim($matches[1]));
    }

    // para -- collected in a local array; writing array offsets directly into the ''
    // default fails on modern PHP.
    $para = array();
    $tablepattern = '/<table\s+id="part_specs".[^>]*>(.+)<\/table>/isU';
    if (preg_match($tablepattern, $html, $matches)) {
        $pattern = '/<tr>[\s\n]*<td><strong>(.+)<\/strong><\/td><td>(.+)<\/td>[\s\n]*<\/tr>/isU';
        if (preg_match_all($pattern, $matches[1], $rows)) {
            foreach ($rows[1] as $k => $name) {
                $name = trim($name);
                $value = trim($rows[2][$k]);
                if ('Package Type' == $name) {
                    // the package type goes into its own field, not into the attribute map
                    $result['package'] = $value;
                    continue;
                }
                $para[$name] = $value;
            }
        }
    }
    if (!empty($para)) {
        $result['para'] = $para;
    }

    // desc
    $pattern = '/<div\s+id="part_title">.+<h4>(.+)<\/h4>[\s\n]*<\/div>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['desc'] = trim($matches[1]);
    }

    // pdf_url
    $pattern = '/<li\s+class="datasheet">[\s\n]*<strong>Datasheet:<\/strong><a\s+href="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['pdf_url'] = $host . trim($matches[1]);
    }

    // sup_stock (thousands separators removed)
    $pattern = '/<td\s+id="inv_1"\s+class="li_inv">([\d,]+)<\/td>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['sup_stock'] = str_replace(',', '', trim($matches[1]));
    }

    // min_purch
    $pattern = '/<span\s+id="multiples">[\s\n]*<strong>Multiple:\s*<\/strong>(.+)<\/span>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['min_purch'] = trim($matches[1]);
    }

    // price -- map of minimum quantity => price, built locally for the same reason as para
    $price = array();
    $pattern = '/<div\s+id="price_1"\s+class="li_price">(.[^<]+)<\/div>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $price[1] = trim($matches[1]);
    }
    $pattern = '/<div\s+id="price_1"\s+class="li_price">[\s\n]*<span.[^>]+title="(.[^"]+)">/isU';
    if (preg_match($pattern, $html, $matches)) {
        // The URL is read out of an HTML attribute, so entities have to be decoded
        // before it can be requested.
        $priceurl = html_entity_decode($matches[1], ENT_QUOTES);
        $priceResponse = curl_get($priceurl);
        $json = isset($priceResponse['result']) ? $priceResponse['result'] : '';
        if (!empty($json)) {
            $jsonresult = json_decode($json, true);
            // Only iterate when the decoded payload really has the expected shape.
            if (isset($jsonresult['parts'][0]['webprice']['resale'])
                && is_array($jsonresult['parts'][0]['webprice']['resale'])
            ) {
                foreach ($jsonresult['parts'][0]['webprice']['resale'] as $tier) {
                    if (isset($tier['minqty'], $tier['price'])) {
                        $price[$tier['minqty']] = $tier['price'];
                    }
                }
            }
        }
    }
    if (!empty($price)) {
        $result['price'] = $price;
    }

    // img_url
    $pattern = '/<div\s+id="part_image">[\s\n]*<img\s+src="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['img_url'] = trim($matches[1]);
    }

    // page_url
    $result['page_url'] = $url;

    return $result;
}

/**
 * Entry point: search by keyword and return the parsed detail of every hit.
 *
 * @param string $keywords Search keyword.
 * @return array One getDetail() result per link found; empty when there are no links.
 */
function getData($keywords)
{
    $hrefList = getListHrefAll($keywords);
    if (empty($hrefList)) {
        // covers both "no results" (empty array) and "empty keyword" (false)
        return array();
    }

    $result = array();
    foreach ($hrefList as $href) {
        $result[] = getDetail($href);
    }

    return $result;
}

// Test Script
// Read the keyword defensively: the parameter may be missing or not a string.
$keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';
$result = getData($keywords);
print_r($result);
相关推荐
83911535 2020-11-13
曾是土木人 2020-10-31
yegen00 2020-10-21
soralaro 2020-10-11
katanaFlower 2020-09-18
wytzsjzly 2020-08-17
88407710 2020-08-17
ChinaJoeEE 2020-08-16
CyborgLin 2020-08-15
Blueberry 2020-08-15
PinkBean 2020-08-11
katanaFlower 2020-08-03
hunningtu 2020-07-30
阿债的方寸天地 2020-06-28
pingyan 2020-06-25
wytzsjzly 2020-06-25
阳光岛主 2020-06-25
阿债的方寸天地 2020-06-16