Batch compression of downloaded files with Python multithreading

Posted by habs20 on Fri, 20 Dec 2019 16:31:09 +0100

The main functions are as follows:

1. Call api interface to get json data; 2. Convert json data to a csv file; 3. Convert each line of data in the csv file to a single xml file; 4. Package every 5 XML files

The modules used are:

       csv,xml,threading,tarfile,queue

Daemons

       

memorandum:

Threads can communicate safely through a thread-safe queue

wait and set between threads

Use a loop to wait for a notification and break out of the loop once it arrives; for a daemon thread, set a flag and then break

'''
Created on October 4, 2018

@author: Xu Liangjun


****All threads are handed over to the CPU for scheduler-determined execution; "starting" a thread only makes it runnable
//Inter-thread communication and notification must be coordinated explicitly****

//Design module:
    csv
    xml.etree.ElementTree
    threading
    queue
    


'''
import requests,logging,csv,time
from functools import wraps
from threading import Thread,Event
from xml.etree.ElementTree import ElementTree,tostring,Element

from queue import Queue

import tarfile
import os

class DownloadThread(Thread):
    '''Download thread (I/O-bound).

    Fetches one stock quote from the Sina quote API and pushes the parsed
    field list onto the shared queue as ``(sid, fields)``.

    The data returned by "http://hq.sinajs.cn/list=%s" looks like:
    var hq_str_sh601015="Shaanxi black cat,6.400,6.420,6.440,6.470,6.390,6.440,6.450,3377008,21697080.000,53120,6.440,91300,6.430,92750,6.420,131000,6.410,131800,6.400,49600,6.450,25200,6.460,15700,6.470,11700,6.480,14200,6.490,2018-09-28,15:00:00,00";
    '''
    def __init__(self, sid, queue):
        Thread.__init__(self)
        self.sid = sid
        self.url = "http://hq.sinajs.cn/list=%s" % sid
        self.queue = queue

    def download(self, url):
        '''Fetch *url* and return the comma-separated quote fields.

        Returns an empty list when the HTTP request is not successful
        (the original code left the result variable unbound in that case,
        raising NameError on the final return).
        '''
        fields = []
        # NOTE(review): requests' timeout is in SECONDS; 3000 s looks like
        # it was meant to be 3000 ms -- confirm before changing the value.
        response = requests.get(url, timeout=3000)
        response.encoding = "GBK"  # Sina serves GBK-encoded text
        if response.ok:
            content = response.text
            # the payload sits between the first and last double quote
            fields = content[content.find("\"") + 1:content.rfind("\"")].split(",")
        return fields

    def run(self):
        # one request per thread; the consumer matches results by sid
        data = self.download(self.url)
        self.queue.put((self.sid, data))
        
        
        
        
        
        
class ConvertCSVThread(Thread):
    '''Consumer thread: drains ``(sid, fields)`` tuples from the queue and
    appends them as rows of fundnav.csv.

    A sentinel of ``(-1, anything)`` on the queue terminates the loop.
    Sets ``csvEvent`` once the csv file is complete so downstream threads
    may start reading it.
    '''

    def __init__(self, queue, csvEvent):
        Thread.__init__(self)
        self.queue = queue
        self.csvEvent = csvEvent

    def dataToCsv(self, writer):
        '''Consume queue items until the -1 sentinel, writing one csv row
        per valid quote: (name, opening price, closing price, date).'''
        while True:
            sid, data = self.queue.get()
            if sid == -1:  # sentinel: all producers have finished
                break
            # Skip failed downloads (empty list / empty name) and rows too
            # short for data[30], which would raise IndexError.
            if data and data[0] != '' and len(data) > 30:
                writer.writerow((data[0], data[1], data[2], data[30]))

    def run(self):
        # utf-8-sig writes a BOM so spreadsheet apps detect the encoding;
        # the context manager guarantees the file is closed on any exit path.
        with open("fundnav.csv", "w", encoding='utf-8-sig') as csvFile:
            writer = csv.writer(csvFile, lineterminator='\n')
            writer.writerow(("Fund name", "Opening price", "Closing price", "date"))
            self.dataToCsv(writer)
        self.csvEvent.set()  # tell other threads the csv file is ready
        

class ConvertXMLThread(Thread):
    '''Turn each data row of the csv file into a single xml file under
    ./tarxml, signalling the packing thread after every batch of 5.

    Coordination protocol:
      * waits on ``csvEvent`` until the csv file has been written,
      * after 5 files (or the final partial batch) sets ``cEvent`` and
        blocks on ``tEvent`` until the TAR thread has packed the batch.
    '''

    def __init__(self, cEvent, tEvent, csvEvent):
        Thread.__init__(self)
        self.cEvent = cEvent
        self.tEvent = tEvent
        self.csvEvent = csvEvent

    def csvToXml(self, scsv):
        '''Convert csv file *scsv* row-by-row into tarxml/<n>.xml files.'''
        # Fix: the output directory must exist before ElementTree.write,
        # otherwise the first write raises FileNotFoundError.
        os.makedirs('tarxml', exist_ok=True)
        with open(scsv, 'r', encoding='utf-8-sig') as f:
            reader = csv.reader(f)
            headers = next(reader)
            # header cells become xml tag names, so strip embedded spaces
            headers = list(map(lambda h: h.replace(' ', ''), headers))

            readerlist = list(reader)

            index = 0                   # position inside the current batch of 5
            sycount = len(readerlist)   # total number of data rows
            itindex = 0                 # rows processed so far

            for row in readerlist:
                index += 1
                itindex += 1

                root = Element('Data')
                eRow = Element('Row')
                for tag, text in zip(headers, row):
                    e = Element(tag)
                    e.text = text
                    eRow.append(e)
                root.append(eRow)

                et = ElementTree(root)
                # file names restart at 1 for each batch; the TAR thread
                # removes the files after packing, so names never collide
                et.write("tarxml/%s.xml" % index, "utf-8")

                if sycount == itindex:   # last row: flush the final (partial) batch
                    self.cEvent.set()    # notify the TAR thread
                    self.tEvent.wait()   # block until the batch is packed
                    break
                elif index == 5:         # a full batch of 5: pack it
                    self.cEvent.set()
                    self.tEvent.wait()
                    self.tEvent.clear()
                    index = 0

    def run(self):
        # wait until the csv file has been fully written, convert once, exit
        self.csvEvent.wait()
        self.csvToXml("fundnav.csv")
 

class TARThread(Thread):
    '''Daemon thread that packs batches of xml files into numbered .tgz
    archives. Being a daemon, it dies automatically when the main thread
    exits (its run loop never returns on its own).
    '''
    def __init__(self, cEvent, tEvent):
        Thread.__init__(self)
        self.count = 0       # sequence number of the next archive
        self.cEvent = cEvent
        self.tEvent = tEvent
        # setDaemon() is deprecated (Python 3.10+); assign the attribute.
        self.daemon = True

    def tarXML(self):
        '''Pack (and remove) every tarxml/*.xml into <count>.tgz.

        Skips archive creation entirely when there is nothing to pack, so
        the numbering has no gaps (the original created an empty .tgz,
        then deleted it but had already consumed a sequence number).
        '''
        xml_files = [f for f in os.listdir('./tarxml') if f.endswith('.xml')]
        if not xml_files:
            return
        self.count += 1
        tfname = '%d.tgz' % self.count
        # context manager guarantees the archive is finalized and closed
        with tarfile.open(tfname, 'w:gz') as tf:
            for fname in xml_files:
                path = './tarxml/%s' % fname
                tf.add(path)
                os.remove(path)  # consumed: the converter reuses the names

    def run(self):
        while True:
            self.cEvent.wait()   # block until a batch of xml files is ready
            self.tarXML()
            self.cEvent.clear()
            self.tEvent.set()    # let the converter produce the next batch
 

  

def handle():
    '''Drive the whole pipeline.

    Many producers (download threads, I/O-bound) feed a single queue; one
    consumer writes fundnav.csv; the csv is then split into xml files,
    which a daemon thread packs into .tgz archives in batches of five.
    Threads coordinate through the queue and three Event objects.
    '''
    start = time.time()

    queue = Queue()
    codes = ['sh601006','sh601005','sh601003','sh601002','sh601001','sh601007','sh601008',
           'sh601009','sh601010','sh601011','sh601012','sh601013','sh601014','sh601015','sh601016','sh601017',
           'sh601018','sh601019','sh601020','sh601021','sh601022','sh601023','sh601024','sh601025','sh601026']

    # Signals that fundnav.csv exists before the xml conversion starts.
    csvEvent = Event()
    ct = ConvertCSVThread(queue, csvEvent)
    downloaders = [DownloadThread(code, queue) for code in codes]

    # Start everything; the scheduler interleaves them as it sees fit.
    for worker in downloaders:
        worker.start()
    ct.start()

    cEvent = Event()
    tEvent = Event()
    cxml = ConvertXMLThread(cEvent, tEvent, csvEvent)
    tart = TARThread(cEvent, tEvent)
    cxml.start()
    tart.start()

    # Wait for every producer before sending the stop sentinel.
    for worker in downloaders:
        worker.join()

    queue.put((-1, None))  # sentinel: tells the csv consumer to stop

    # Block until the csv consumer has finished writing the file.
    ct.join()

    print("time consuming %s" % (time.time() - start) )



# Guard the entry point so importing this module does not trigger the
# network download / file pipeline as a side effect.
if __name__ == "__main__":
    handle()



Topics: xml encoding JSON Lambda