For collecting data from sites with heavy anti-crawler protection, there is a very good tool: Octopus (bazhuayu). It accesses pages directly through a browser, so one brute-force approach beats ten thousand clever tricks.
Preparatory work: money. Pay for an Octopus website membership for a month or more, so that you can use Cloud Collection.
Next, set up the data-collection task yourself. Once the collection works, schedule it as a timed cloud collection.
Then click Cloud Collection to view the data, and there will be a lot of it. Below is how to pull that data to your local system and then clear it from the cloud regularly.
The code:
Call get_group_list to find your task group id, then put the group id you need into get_task_list and run it to get the full task list. You can then loop over the task list, fetch the data of each task and, once it has been processed, call the API to clear that task's data. That is all there is to it.
/**
 * Imports articles collected by Octopus (bazhuayu) cloud tasks into the local
 * `news_source` table and then clears the collected data through the API.
 *
 * Octopus interface document:
 * https://dataapi.bazhuayu.com/help#_ref_status_code
 */
class NewsData extends Frontend
{
    /** @var string Base URL of the Octopus data API. */
    public $url = "https://dataapi.bazhuayu.com";

    /**
     * Entry point: walks every task of the configured group, inserts the
     * collected rows that are not yet stored locally, then clears the task's
     * data via the API. Prints progress with echo and always ends with die().
     *
     * NOTE(review): only one batch (200 rows by default) is fetched per task
     * before remove_task_data() is called. If RemoveDataByTaskId clears ALL
     * data of the task, rows beyond the first batch would be lost - confirm
     * against the API documentation.
     */
    public function ins_list()
    {
        $task_list = $this->get_task_list();
        if ($task_list == false) {
            echo "Task list is empty";
            die();
        }

        $NewsSource = new NewsSource();
        foreach ($task_list as $task) {
            $task_id = $task['taskId'];
            echo "task{$task['taskName']}:{$task_id}start: ";
            // Pause between tasks (presumably to stay below the API rate limit).
            sleep(1);

            $data = $this->get_task_data($task_id);
            if (empty($data)) {
                // Nothing was fetched, so the task's data is left untouched.
                echo "Data is empty:skip! ";
                continue;
            }

            foreach ($data as $item) {
                // Guard the lookup: a row without a 'content' key is treated
                // like an empty article instead of raising a warning.
                $content = isset($item['content']) ? $item['content'] : '';
                if (strlen((string)$content) < 10) {
                    echo "Content is empty,skip! ";
                    continue;
                }

                $ins = [
                    "cate_name" => "Journalism",
                    "search_name" => $this->item_value($item, 'Search keywords'),
                    "title" => $this->item_value($item, 'Latest article title'),
                    "keywords" => "",
                    "description" => "",
                    "thumb" => "",
                    "content" => $NewsSource->gz_str($content),
                    "create_time_text" => $this->item_value($item, 'time'),
                    "status" => "1",
                    "oldurl" => $this->item_value($item, 'Links to the latest articles'),
                    "author" => $this->item_value($item, 'Name of official account'),
                    "author_avatar" => $this->item_value($item, 'Head portrait of the general manager'),
                    "author_wechat" => $this->item_value($item, 'Official account micro signal'),
                ];

                // Skip rows whose title is already stored.
                $has_id = Db::name("news_source")->where(["title" => $ins['title']])->value("id");
                if (intval($has_id) !== 0) {
                    echo "repeat ID:{$has_id},title:{$ins['title']}--Duplicate title,skip! \n";
                    continue;
                }

                $ins_id = Db::name("news_source")->insertGetId($ins);
                echo "Successfully inserted one id:{$ins_id} ";
            }
            unset($data);

            // Clear the task's data once its batch has been processed.
            $this->remove_task_data($task_id);
        }

        echo "end of execution!";
        die();
    }

    /**
     * Returns an access token, reusing the cached one while it is still valid.
     *
     * A fresh token is requested from `/token` (password grant) when nothing is
     * cached or the cached entry has expired; it is then cached for its
     * `expires_in` lifetime.
     *
     * @return array|false Decoded token response with an added `expires_time`
     *                     timestamp, or false when the token request failed
     *                     (the raw response is echoed in that case).
     */
    public function get_token()
    {
        $token_name = "bazhuayu_token";
        $token_data = Cache::get($token_name);

        $is_valid = $token_data != false
            && isset($token_data['expires_time'])
            && $token_data['expires_time'] >= time();
        if ($is_valid) {
            return $token_data;
        }

        $params = http_build_query([
            "username" => "xxxx",
            "password" => "xxxx..",
            "grant_type" => "password",
        ]);
        $token_json = $this->sendPost($this->url . "/token", $params);
        $token_data = json_decode($token_json, true);
        if (!isset($token_data['expires_in'])) {
            echo $token_json;
            return false;
        }

        $token_data['expires_time'] = $token_data['expires_in'] + time();
        Cache::set($token_name, $token_data, $token_data['expires_in']);
        return $token_data;
    }

    /**
     * Builds the request headers carrying the access token.
     * Echoes "token error!" and terminates the script when no token is available.
     *
     * @return string[] Header lines for CURLOPT_HTTPHEADER.
     */
    public function get_header()
    {
        $token_data = $this->get_token();
        if ($token_data == false) {
            echo "token error!";
            die();
        }

        // Token type and token are separated by exactly one space; a line
        // break inside the value would produce a malformed header.
        return [
            "Authorization:" . $token_data['token_type'] . ' ' . $token_data["access_token"],
        ];
    }

    /**
     * Fetches the task group list and dumps it with var_dump(), so the
     * group id to use can be looked up by hand.
     */
    public function get_group_list()
    {
        $url = $this->url . "/api/TaskGroup";
        $list = json_decode($this->sendGet($url, $this->get_header()), true);
        var_dump($list);
    }

    /**
     * Fetches the tasks that belong to a task group.
     *
     * @param int|string $group_id Task group id (defaults to the group used so far).
     * @return array|false The `data` entry of the response, or false when it is missing.
     */
    public function get_task_list($group_id = 2206006)
    {
        $url = $this->url . "/api/Task?taskGroupId=" . rawurlencode((string)$group_id);
        $list = json_decode($this->sendGet($url, $this->get_header()), true);
        return isset($list['data']) ? $list['data'] : false;
    }

    /**
     * Fetches one batch of rows of a task from `/api/notexportdata/gettop`.
     * (`/api/alldata/GetDataOfTaskByOffset` with taskId/offset/size is the
     * offset-based alternative endpoint.)
     *
     * @param string|null $task_id Task id.
     * @param int         $size    Maximum number of rows to request.
     * @return array|false The non-empty `data.dataList` of the response, otherwise false.
     */
    public function get_task_data($task_id = null, $size = 200)
    {
        $url = $this->url . "/api/notexportdata/gettop?taskId=" . rawurlencode((string)$task_id)
            . "&size=" . intval($size);
        $data = json_decode($this->sendGet($url, $this->get_header()), true);
        if (!empty($data['data']['dataList'])) {
            return $data['data']['dataList'];
        }
        return false;
    }

    /**
     * Asks the API to remove the data of a task (`/api/task/RemoveDataByTaskId`),
     * dumps the raw response and prints a completion line.
     *
     * @param string|null $task_id Task id.
     */
    public function remove_task_data($task_id = null)
    {
        $url = $this->url . "/api/task/RemoveDataByTaskId?taskId=" . rawurlencode((string)$task_id);
        $json = $this->sendPost($url, null, $this->get_header());
        var_dump($json);
        echo "task ID{$task_id}Data destruction completed!";
    }

    /**
     * Sends a POST request (30 second timeout).
     *
     * @param string            $url    Request address
     * @param string|array|null $data   Request body
     * @param string[]|null     $header Optional request header lines
     * @return string|false Response body, or false on a cURL error (the error is echoed)
     */
    public function sendPost($url, $data, $header = null)
    {
        return $this->send_curl_request($url, $header, 30, true, $data);
    }

    /**
     * Sends a GET request (20 second timeout).
     *
     * @param string        $url    Request address
     * @param string[]|null $header Optional request header lines
     * @return string|false Response body, or false on a cURL error (the error is echoed)
     */
    public function sendGet($url, $header = null)
    {
        return $this->send_curl_request($url, $header, 20, false, null);
    }

    /**
     * Returns $item[$key], or null when the key is absent, without raising
     * an undefined-index warning.
     */
    private function item_value($item, $key)
    {
        return isset($item[$key]) ? $item[$key] : null;
    }

    /**
     * Shared cURL transport used by sendPost() and sendGet().
     *
     * @param string            $url     Request address
     * @param string[]|null     $header  Optional request header lines
     * @param int               $timeout Total timeout in seconds
     * @param bool              $is_post Whether to send a POST instead of a GET
     * @param string|array|null $data    POST body (ignored for GET)
     * @return string|false Response body, or false on a cURL error
     */
    private function send_curl_request($url, $header, $timeout, $is_post, $data)
    {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        // Verify the peer certificate: these requests carry the account
        // credentials and the bearer token.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
        // Require the certificate's host name to match the requested host.
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
        if (!empty($header)) {
            curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
        }
        // Do not include the response headers in the returned body.
        curl_setopt($curl, CURLOPT_HEADER, false);
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // Follow redirects
        curl_setopt($curl, CURLOPT_AUTOREFERER, 1);    // Set Referer automatically on redirects
        if ($is_post) {
            curl_setopt($curl, CURLOPT_POST, 1);
            curl_setopt($curl, CURLOPT_POSTFIELDS, $data === null ? '' : $data);
        }
        curl_setopt($curl, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // Return the response instead of printing it

        $return_data = curl_exec($curl);
        if (curl_errno($curl)) {
            echo 'Errno' . curl_error($curl);
        }
        curl_close($curl);

        return $return_data;
    }
}