Naive Bayes binary classification hand-coded in Python, without any framework

Posted by rr1024 on Sat, 26 Oct 2019 00:00:22 +0200

Data set: https://archive.ics.uci.edu/ml/datasets/Adult

This data set consists of string-valued records, so you need to clean and transform the data yourself. Fortunately, string handling in Python is simple. The features include education level, the nature of the job, the type of employer, and so on.

Naive Bayes: https://www.bilibili.com/video/av36338359?from=search&seid=1177086802297258225

 

This data set is used to predict whether a person's annual salary can exceed $50k / year.

Please refer to the data set page linked above for the specific data characteristics.

Because this is binary classification, the idea is very simple: we only need to compute the probability of one class, since the probability of the other class is its complement.

The ConvertFeature method in the following code converts string-valued data to numeric codes, but the main block passes numeric codes directly, so it is not used in practice.

Note that you need to change the path of the dataset.

import torch
import torch.nn.modules
import torch.nn
import numpy as np
from torch.autograd import Variable #Basic variables of torch
import torch.nn.functional as F #There's a lot of torch functions in it.
import matplotlib.pyplot as plt


class solution:
    """Hand-written naive Bayes classifier for a two-class, comma-separated data set.

    Typical use: ``LoadData(path)`` -> ``WashData()`` -> ``BayesPredict(feature)``.
    Every column is treated as categorical; the last column is the class label.
    """

    def __init__(self):
        # Rows exactly as read from the file: lists of stripped strings.
        self.RawData=[]
        # Rows after WashData(): lists of integer codes, same shape as RawData.
        self.Data=[]
        # Flat value -> code lookup, kept for backward compatibility. When the
        # same string occurs in several columns, the code from the earliest
        # column is the one stored here.
        self.GetClass={}
        # One value -> code dict per column; this is what Data is encoded with.
        self.ColumnClass=[]

        # Number of columns, including the label column (set by WashData).
        self.NUMOFFEATURE=0
        # Number of rows in RawData (set by LoadData).
        self.NUMOFDATA=0

    def LoadData(self,PATH):
        """Append the comma-separated rows of the file at PATH to RawData.

        Blank lines are skipped wherever they occur, so a trailing empty line
        does not produce a bogus row and no real row is dropped when the file
        has no trailing empty line.
        """
        with open(PATH,'r') as f:
            for line in f:
                if not line.strip():
                    continue
                self.RawData.append([thing.strip() for thing in line.split(',')])
        self.NUMOFDATA=len(self.RawData)

    def ShowRawData(self):
        """Print every raw (string) row."""
        for item in self.RawData:
            print(item)

    def WashData(self):
        """Encode RawData into integer codes and store the result in Data.

        Each column gets its own codes 0, 1, 2, ... in order of first
        appearance, so two different values of one column never share a code.
        The encoding is rebuilt from scratch on every call, which makes the
        method safe to call repeatedly. The column count is taken from the
        first row; a shorter row raises IndexError.
        """
        self.Data=[]
        self.ColumnClass=[]
        self.GetClass={}

        if not self.RawData:
            self.NUMOFFEATURE=0
            return

        width=len(self.RawData[0])
        for column in range(width):
            codes={}
            for row in self.RawData:
                # len(codes) is the next unused code for this column.
                codes.setdefault(row[column],len(codes))
            self.ColumnClass.append(codes)
            for value,code in codes.items():
                self.GetClass.setdefault(value,code)
        self.NUMOFFEATURE=width

        for row in self.RawData:
            self.Data.append([self.ColumnClass[column][value]
                              for column,value in enumerate(row)])

    def ShowData(self):
        """Print every encoded (integer) row."""
        for item in self.Data:
            print(item)

    def BayesPredict(self,feature):
        """Classify one encoded sample with naive Bayes (binary labels only).

        Args:
            feature: integer codes, one per attribute column in column order.
                Entries beyond the attribute columns (for example a
                placeholder in the label position) are ignored.

        Returns:
            "No" when label code 0 (the first label value WashData saw) has a
            posterior probability above 0.5, otherwise "Yes".
            NOTE(review): for the Adult data, code 0 is presumably "<=50K",
            making "Yes" mean "earns more than 50K" -- confirm against the file.

        Raises:
            ValueError: if there is no encoded data (WashData not run yet).
            IndexError: if feature has fewer entries than attribute columns.

        Probabilities are plain relative frequencies (no smoothing). If the
        sample matches no training row in either class for some attribute,
        the class prior is used instead.
        """
        attribute_count=self.NUMOFFEATURE-1
        if not self.Data or attribute_count<0:
            raise ValueError("no encoded training data; call LoadData() and WashData() first")
        if len(feature)<attribute_count:
            raise IndexError("feature has {} entries but {} attribute columns are required"
                             .format(len(feature),attribute_count))

        label_column=attribute_count
        class0=[row for row in self.Data if row[label_column]==0]
        others=[row for row in self.Data if row[label_column]!=0]

        score0=self._JointScore(class0,feature,attribute_count)
        score_rest=self._JointScore(others,feature,attribute_count)

        evidence=score0+score_rest
        if evidence>0:
            # Normalise so the 0.5 threshold is applied to a real posterior.
            P=score0/evidence
        else:
            # Sample unseen in both classes: fall back to the class prior.
            P=len(class0)/len(self.Data)

        if P>0.5:
            return "No"
        return "Yes"

    def _JointScore(self,rows,feature,attribute_count):
        """Return P(class) * prod_i P(feature[i] | class) for the class made of rows."""
        if not rows:
            return 0.0
        score=len(rows)/len(self.Data)
        for column in range(attribute_count):
            matches=sum(1 for row in rows if row[column]==feature[column])
            if matches==0:
                return 0.0
            score*=matches/len(rows)
        return score

    def ConvertFeature(self,feature):
        """Translate raw string values into the integer codes used in Data.

        Args:
            feature: raw values in column order (the label may be omitted).

        Returns:
            The list of codes, or [] if any value was never seen in its
            column (or lies beyond the last encoded column).
        """
        res=[]
        for column,item in enumerate(feature):
            if column>=len(self.ColumnClass) or item not in self.ColumnClass[column]:
                return []
            res.append(self.ColumnClass[column][item])
        return res








if __name__=="__main__":
    import sys

    # Path of the data file: first command-line argument when given,
    # otherwise the original hard-coded location.
    DataPath=sys.argv[1] if len(sys.argv)>1 else r'C:\Users\Lenovo\Desktop\adult.data'

    MySolution=solution()
    MySolution.LoadData(DataPath)
    MySolution.WashData()

    #naive bayes method classification
    # The sample is given directly as integer codes (not raw strings),
    # so ConvertFeature is not needed here.
    feature=[50,2,10132,1,1,2,2,3,0,1,1,1,13,0,0]
    print(MySolution.BayesPredict(feature))







 

Topics: Attribute Python