A handy data-collection tool: automating Octopus cloud scraping and importing the results into your database

Posted by christillis on Tue, 02 Jun 2020 17:02:35 +0200

For scraping sites that have heavy anti-crawler protection there is a very good tool: Octopus. It collects pages directly through a real browser, so one brute-force approach beats ten thousand clever tricks.

Preparation: money. Buy an Octopus membership for a month or more so that you can use Cloud Collection.

Next, set up the scraping task yourself. Once it works, schedule it as a timed cloud collection.

Then open the cloud collection results to view the data; there will be a lot of it. What follows shows how to pull that data down to your own database and then clear it from the cloud regularly.

The code:

First call get_group_list to find your task group id, then put the group id you need into get_task_list to fetch every task in that group. You can then loop over the task list and fetch the data of each task; once it has been stored, call the API's clear operation for that task, and you are done.


/**
 * Pulls scraped article rows from the Octopus (bazhuayu) data API, stores them
 * in the `news_source` table and then clears the task data on the remote side.
 *
 * Typical flow: get_group_list() (find the group id once, by hand) ->
 * ins_list(), which chains get_task_list(), get_task_data() and
 * remove_task_data() for every task in the group.
 *
 * NOTE(review): Frontend, Db, Cache and NewsSource are defined outside this
 * file; they look like ThinkPHP-style framework classes - confirm.
 */
class NewsData extends Frontend{

    //remarks
    //Octopus interface document
    //https://dataapi.bazhuayu.com/help#_ref_status_code

    /** Base URL of the Octopus data API; every endpoint below is built from it. */
    public $url="https://dataapi.bazhuayu.com";

    /**
     * Import the pending rows of every task in the group into `news_source`.
     *
     * For each task: fetch one batch of rows, skip rows whose content is
     * shorter than 10 bytes or whose title already exists, insert the rest,
     * then clear the task's data remotely. Progress is echoed and the script
     * terminates with die() when finished.
     *
     * NOTE(review): only one batch (get_task_data()'s $size rows) is fetched
     * per task before remove_task_data() is called. If the remote clear wipes
     * everything stored for the task, rows beyond the first batch are lost -
     * confirm against the API documentation.
     */
    public function ins_list()
    {
        $task_list=$this->get_task_list();
        if(empty($task_list)){
            echo "Task list is empty";
            die();
        }
        $NewsSource=new NewsSource();
        foreach ($task_list as $task){
            $task_id=$task['taskId'];//Task id
            //Call task related data details according to task ID
            echo "task{$task['taskName']}:{$task_id}start:    ";
            // Pause between tasks; presumably to stay under an API rate limit - confirm.
            sleep(1);

            $data=$this->get_task_data($task_id);
            if(empty($data)){
                echo "Data is empty:skip!      ";
                continue;
            }
            //Loop insert data
            foreach ($data as $item){
                // Check the content first so no insert row is built for rows that are
                // skipped, and so a missing key does not raise a notice/warning.
                $content=isset($item['content'])?(string)$item['content']:'';
                if(strlen($content)<10){
                    echo "Content is empty,skip!   ";
                    continue;
                }

                $ins=[
                    "cate_name"=>"Journalism",
                    "search_name"=>$this->octopusItemField($item,'Search keywords'),
                    "title"=>$this->octopusItemField($item,'Latest article title'),
                    "keywords"=>"",
                    "description"=>"",
                    "thumb"=>"",
                    "content"=>$NewsSource->gz_str($content),
                    "create_time_text"=>$this->octopusItemField($item,'time'),
                    "status"=>"1",
                    "oldurl"=>$this->octopusItemField($item,'Links to the latest articles'),
                    "author"=>$this->octopusItemField($item,'Name of official account'),
                    "author_avatar"=>$this->octopusItemField($item,'Head portrait of the general manager'),
                    "author_wechat"=>$this->octopusItemField($item,'Official account micro signal'),
                ];

                //Whether the query is duplicate
                $has_id=Db::name("news_source")->where(["title"=>$ins['title']])->value("id");
                if(intval($has_id)!==0){
                    echo "repeat ID:{$has_id},title:{$ins['title']}--Duplicate title,skip!   ";
                    continue;
                }

                //Insert a piece of data
                $ins_id=Db::name("news_source")->insertGetId($ins);
                echo "Successfully inserted one id:{$ins_id}    ";
            }
            unset($data);

            //Clear task data
            $this->remove_task_data($task_id);
        }
        echo "end of execution!";
        die();
    }

    /**
     * Read one column of a scraped row without raising a notice when it is absent.
     *
     * @param  array  $item One row of the task's dataList
     * @param  string $key  Column name as configured in the Octopus task
     * @return mixed        The column value, or null when the row lacks that column
     */
    private function octopusItemField($item,$key)
    {
        return isset($item[$key])?$item[$key]:null;
    }

    /**
     * Return the API token data, using the cached copy while it is still valid.
     *
     * When the cache is empty or the stored expires_time has passed, a new
     * token is requested with the password grant, stamped with an absolute
     * expires_time and cached for expires_in seconds.
     *
     * @return array|false Decoded token response (plus expires_time), or false when
     *                     the response has no expires_in (the raw response is echoed)
     */
    public function get_token(){
        $token_name="bazhuayu_token";
        $token_data=Cache::get($token_name);
        // Cached token is present and not yet expired: reuse it.
        if(!empty($token_data)&&isset($token_data['expires_time'])&&$token_data['expires_time']>=time()){
            return $token_data;
        }

        // NOTE(review): placeholder credentials; presumably these should be replaced,
        // ideally loaded from configuration rather than kept in source - confirm.
        $params=http_build_query([
            "username"=>"xxxx",
            "password"=>"xxxx..",
            "grant_type"=>"password",
        ]);
        $token_json=$this->sendPost($this->url."/token",$params);
        $token_data=json_decode($token_json,true);
        if(!isset($token_data['expires_in'])){
            echo $token_json;
            return false;
        }
        $token_data['expires_time']=$token_data['expires_in']+time();
        Cache::set($token_name,$token_data,$token_data['expires_in']);
        return $token_data;
    }

    /**
     * Build the HTTP header list carrying the Authorization token.
     * Terminates the script when no token can be obtained.
     *
     * @return array List with a single "Authorization:<type> <token>" header line
     */
    public function get_header(){
        $token_data=$this->get_token();
        if($token_data==false){
            echo "token error!";
            die();
        }
        return [
            "Authorization:".$token_data['token_type'].' '.$token_data["access_token"],
        ];
    }

    /**
     * Fetch the task group list and dump it, so the group id can be looked up by hand.
     *
     * @return mixed The decoded response (null when it is not valid JSON)
     */
    public function get_group_list(){
        $list=$this->sendGet($this->url."/api/TaskGroup",$this->get_header());
        $list=json_decode($list,true);
        var_dump($list);
        return $list;
    }

    /**
     * Fetch the tasks that belong to one task group.
     *
     * @param  int|string $group_id Task group id; defaults to the group this importer was written for
     * @return array|false          The response's "data" entry, or false when it is absent
     */
    public function get_task_list($group_id=2206006){
        $url=$this->url."/api/Task?taskGroupId=".urlencode((string)$group_id);
        $list=json_decode($this->sendGet($url,$this->get_header()),true);
        return isset($list['data'])?$list['data']:false;
    }

    /**
     * Fetch one batch of rows for a task from the "notexportdata/gettop" endpoint.
     *
     * @param  string|null $task_id Task id
     * @param  int         $size    Maximum number of rows requested
     * @return array|false          The response's data.dataList, or false when missing or empty
     */
    public function get_task_data($task_id=null,$size=200)
    {
        $url=$this->url."/api/notexportdata/gettop?taskId=".urlencode((string)$task_id)."&size=".intval($size);
        $data=json_decode($this->sendGet($url,$this->get_header()),true);
        if(empty($data['data']['dataList'])){
            return false;
        }
        return $data['data']['dataList'];
    }

    /**
     * Ask the API to remove the stored data of a task and report the outcome.
     *
     * @param string|null $task_id Task id
     */
    public function remove_task_data($task_id=null)
    {
        $url=$this->url."/api/task/RemoveDataByTaskId?taskId=".urlencode((string)$task_id);
        $json=$this->sendPost($url,null,$this->get_header());
        var_dump($json);
        // curl_exec() yields false on a transport error; do not claim success then.
        if($json===false){
            echo "task ID{$task_id}Data destruction failed!";
            return;
        }
        echo "task ID{$task_id}Data destruction completed!";
    }

    /**
     * Send a POST request.
     *
     * @param  string            $url    Request address
     * @param  string|array|null $data   Request body; null sends an empty body
     * @param  array|null        $header Extra HTTP header lines
     * @return string|false              Response body, or false on failure
     */
    public function sendPost($url,$data,$header=null)
    {
        return $this->octopusRequest($url,$header,30,true,$data);
    }

    /**
     * Send a GET request.
     *
     * @param  string     $url    Request address
     * @param  array|null $header Extra HTTP header lines
     * @return string|false       Response body, or false on failure
     */
    public function sendGet($url,$header=null)
    {
        return $this->octopusRequest($url,$header,20,false);
    }

    /**
     * Shared cURL transport behind sendGet() and sendPost().
     *
     * @param  string            $url     Request address
     * @param  array|null        $header  Extra HTTP header lines
     * @param  int               $timeout Total timeout in seconds
     * @param  bool              $is_post Whether to issue a POST instead of a GET
     * @param  string|array|null $data    POST body (ignored for GET)
     * @return string|false               Response body, or false on failure
     */
    private function octopusRequest($url,$header,$timeout,$is_post,$data=null)
    {
        $curl = curl_init(); // Start a CURL session
        if($curl===false){
            echo 'Errno'.'curl_init failed';
            return false;
        }
        curl_setopt($curl, CURLOPT_URL, $url); // Address to access
        // Verify the peer certificate: these requests carry the account password
        // and the bearer token, so an unverified connection would expose them.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2); // Certificate must match the host name
        if(!empty($header)){
            curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
            curl_setopt($curl, CURLOPT_HEADER, 0); // Do not include response headers in the returned body
        }
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); // Follow redirects
        curl_setopt($curl, CURLOPT_AUTOREFERER, 1); // Automatically set Referer
        if($is_post){
            curl_setopt($curl, CURLOPT_POST, 1); // Send a regular Post request
            // A null body is sent as an empty string so the option never receives null.
            curl_setopt($curl, CURLOPT_POSTFIELDS, $data===null?'':$data);
        }
        curl_setopt($curl, CURLOPT_TIMEOUT, $timeout); // Abort requests that hang
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // Return the body instead of printing it

        $return_data = curl_exec($curl); // Perform action
        if (curl_errno($curl)) {
            echo 'Errno'.curl_error($curl);
        }
        curl_close($curl); // Close the CURL session
        return $return_data; // Return data
    }

}

Topics: Programming curl Session JSON SSL