Python_Lab_Record_final
# Program: read n integers from the user and print the largest.
# (Indentation reconstructed — the extracted source had none and would not run.)
a = []
n = int(input("Enter number of elements :"))
for _ in range(n):  # loop index was unused; count is all that matters
    b = int(input("Enter element :"))
    a.append(b)
print("largest element is ", max(a))
# Program: read n integers and print the second largest distinct value.
a = []
n = int(input("Enter number of elements "))
for _ in range(n):
    b = int(input("Enter element :"))
    a.append(b)
a.sort()
a.reverse()  # descending order: a[0] is the maximum
# Walk past duplicates of the maximum. The original loop indexed a[i+1]
# unguarded and raised IndexError when every element was equal.
i = 0
while i + 1 < len(a) and a[i] == a[i + 1]:
    i += 1
if i + 1 < len(a):
    print(a[i + 1])
else:
    print("no second largest element")
# program to split even and odd elements into two lists
a = []
n = int(input("Enter number of elements :"))
for _ in range(n):
    b = int(input("Enter Element :"))
    a.append(b)
even = []
odd = []
for value in a:
    # value % 2 == 0 holds for even numbers (including negatives and zero)
    if value % 2 == 0:
        even.append(value)
    else:
        odd.append(value)
print("The even list is ", even)
print("The odd list is ", odd)
#program to search for an element in a list
#linear search
def search(arr, N, x):
    """Linear search: return the index of x within arr[0:N], or -1 if absent.

    arr -- sequence to scan
    N   -- number of leading elements to consider
    x   -- value to look for
    """
    for i in range(N):
        if arr[i] == x:
            return i
    return -1
# driver code: demonstrate linear search for a value that is present
if __name__ == '__main__':
    arr = [2, 3, 4, 10, 40]
    x = 10
    N = len(arr)
    result = search(arr, N, x)
    if result == -1:
        print("element is not present")
    else:
        print("element is present at index", result)
# binary_search
def binary_search(V, To_Find):
    """Binary search on the sorted list V; print where To_Find sits or 'not found'.

    Narrows [lo, hi] until at most two candidates remain, then checks both,
    so it also works for single-element lists.
    """
    lo = 0
    hi = len(V) - 1
    while hi - lo > 1:
        mid = (hi + lo) // 2
        if V[mid] < To_Find:
            lo = mid + 1
        else:
            hi = mid
    if V[lo] == To_Find:
        print("Found at the index", lo)
    elif V[hi] == To_Find:
        print("Found at index", hi)
    else:
        print("not found")
# driver: search for the first element, the last element, and an absent value
if __name__ == '__main__':
    V = [1, 3, 4, 5, 6]
    To_Find = 1
    binary_search(V, To_Find)
    To_Find = 6
    binary_search(V, To_Find)
    To_Find = 10
    binary_search(V, To_Find)
# bubble sort
def bubbleSort(arr):
    """Sort arr in place in ascending order using bubble sort.

    After pass i the largest i elements are in their final positions, so the
    inner loop shrinks each pass. A pass with no swaps means the list is
    already sorted, so we stop early.
    """
    n = len(arr)
    for i in range(n):
        swapped = False
        for j in range(0, n - i - 1):
            if arr[j] > arr[j + 1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
                swapped = True
        if not swapped:
            break
# driver: sort a small sample list and show the result
if __name__ == '__main__':
    arr = [2, 1, 23, 10]
    bubbleSort(arr)
    print("Sorted array is :", arr)
# selection sort
def selectionsort(array, size):
    """Sort array[0:size] in place in ascending order via selection sort."""
    for s in range(size):
        # find the index of the smallest element in the unsorted tail array[s:size]
        min_idx = s
        for i in range(s + 1, size):
            if array[i] < array[min_idx]:
                min_idx = i
        # move it to the front of the unsorted region
        array[s], array[min_idx] = array[min_idx], array[s]
# driver: sort a small sample list and show the result
if __name__ == '__main__':
    data = [7, 2, 1, 6]
    size = len(data)
    selectionsort(data, size)
    print('sorted Array is ascending order is :')
    print(data)
# Insertion Sort
def insertionSort(arr):
    """Sort arr in place in ascending order via insertion sort (stable)."""
    for i in range(1, len(arr)):
        key = arr[i]
        j = i - 1
        # shift elements larger than key one slot right to open key's position
        while j >= 0 and key < arr[j]:
            arr[j + 1] = arr[j]
            j -= 1
        arr[j + 1] = key
# driver: sort a small sample list and show the result
if __name__ == '__main__':
    arr = [12, 11, 13, 5, 6]
    insertionSort(arr)
    print("The sorted array is :", arr)
# quick sort
def partition(array, low, high):
    """Lomuto partition of array[low:high+1] around pivot array[high].

    Returns the pivot's final index; elements <= pivot end up to its left.
    """
    pivot = array[high]
    i = low - 1  # boundary of the "<= pivot" region
    for j in range(low, high):
        if array[j] <= pivot:
            i += 1
            array[i], array[j] = array[j], array[i]
    array[i + 1], array[high] = array[high], array[i + 1]
    return i + 1


def quick_sort(array, low, high):
    """Recursively quicksort array[low:high+1] in place (inclusive bounds)."""
    if low < high:
        pi = partition(array, low, high)
        quick_sort(array, low, pi - 1)
        quick_sort(array, pi + 1, high)
# driver: sort a small sample list and show the result
if __name__ == '__main__':
    array = [10, 7, 8, 9, 1, 5]
    quick_sort(array, 0, len(array) - 1)
    print(f'sorted array:{array}')
# merge sort
def mergesort(arr):
    """Sort arr in place in ascending order via top-down merge sort (stable).

    Lists of length 0 or 1 are already sorted, so recursion bottoms out there.
    """
    if len(arr) > 1:
        mid = len(arr) // 2
        left = arr[:mid]
        right = arr[mid:]
        mergesort(left)
        mergesort(right)
        # merge the two sorted halves back into arr
        i = j = k = 0
        while i < len(left) and j < len(right):
            if left[i] <= right[j]:  # <= keeps the sort stable
                arr[k] = left[i]
                i += 1
            else:
                arr[k] = right[j]
                j += 1
            k += 1
        # copy whichever half still has elements remaining
        while i < len(left):
            arr[k] = left[i]
            i += 1
            k += 1
        while j < len(right):
            arr[k] = right[j]
            j += 1
            k += 1
def printlist(arr):
    """Print the elements of arr, each followed by a space, then a newline."""
    for item in arr:
        print(item, end=" ")
    print()
# driver: print the list before and after sorting
if __name__ == '__main__':
    arr = [12, 11, 10, 3, 5]
    print("given array is", end="\n")
    printlist(arr)
    mergesort(arr)
    print("sorted array is:", end="\n")
    printlist(arr)
# heap sort
def heapify(arr, N, i):
    """Sift arr[i] down so the subtree rooted at i is a max-heap.

    N is the effective heap size; children of i are at 2*i+1 and 2*i+2.
    """
    largest = i
    l = 2 * i + 1
    r = 2 * i + 2
    if l < N and arr[largest] < arr[l]:
        largest = l
    if r < N and arr[largest] < arr[r]:
        largest = r
    if largest != i:
        arr[i], arr[largest] = arr[largest], arr[i]
        heapify(arr, N, largest)  # continue sifting down the displaced value


def heapsort(arr):
    """Sort arr in place in ascending order using heap sort."""
    N = len(arr)
    # build a max-heap: sift down every internal node, last-first
    for i in range(N // 2 - 1, -1, -1):
        heapify(arr, N, i)
    # repeatedly move the current maximum to the end and shrink the heap
    for i in range(N - 1, 0, -1):
        arr[i], arr[0] = arr[0], arr[i]
        heapify(arr, i, 0)
# driver: sort a small sample list and show the result
if __name__ == '__main__':
    arr = [10, 12, 3, 5, 6]
    heapsort(arr)
    N = len(arr)
    print('sorted array is:')
    for i in range(N):
        print("%d" % arr[i], end=" ")
3. Python programs on text handling
# Text handling: common str query methods demonstrated on one sample string.
# (Hard-wrapped comment fragments from the extraction were rejoined — the
# stray continuation lines were syntax errors.)
a = "Hello world"
print(len(a))             # 1. len() returns the length of the string
print(a.count("l"))       # 2. count() returns how many times the value occurs
print(a.startswith("H"))  # 3. True if the string starts with the value
print(a.endswith("l"))    # 4. True if the string ends with the value
print(a.find("l"))        # 5. index of the first occurrence, else -1
print(a.rfind("l"))       # 6. index of the last occurrence, else -1
print(a.isalnum())        # 7. True if all characters are alphanumeric
print(a.isdigit())        # 8. True if all characters are digits
print(a.isalpha())        # 9. True if all characters are alphabetic
print(a.islower())        # 10. True if all cased characters are lower case
# checking in a string
# concatenation in strings: "+" joins strings; a space operand keeps the
# words separated. (The original used an empty string "", which is a no-op
# and printed "helloworld" — contradicting the section's stated purpose.)
a = "hello"
b = "world"
c = a + " " + b
print(c)
# format strings: str.format() fills the {} placeholders with its arguments
age = 18
# we cannot combine text and numbers with "+" like this (TypeError):
# txt = "My name is John, and my age is "+ age
# instead we use the format() method which takes the passed arguments, formats them,
# and places them in the string where the placeholders {} are:
txt = "My name is John, and I am {}"
print(txt.format(age))
# -------------------------------------------------------------------------
# format() takes any number of arguments and fills the {} placeholders left to right
quantity = 3
itemno = 567
price = 49.95
myorder = "I want {} pieces of item {} for {} dollars."
print(myorder.format(quantity, itemno, price))
# -------------------------------------------------------------------------
# index numbers inside {} choose which argument goes in which placeholder
myorder = "I want to pay {2} dollars for {0} pieces of item {1}."
print(myorder.format(quantity, itemno, price))
# looping in strings: a str is iterable character by character
a = "Hello"
for x in a:
    print(x)
# another way of writing the above program...
for x in "Hello":
    print(x)
# to avoid printing a new line every time a print statement is called...
for x in a:
    print(x, end="")
# modify strings: methods that return a transformed copy (strings are immutable)
a = "hello world"
print(a.upper())       # 1. upper() returns the string in upper case
# -------------------------------------------------------------------------
b = "HelLO woRlD"
print(b.lower())       # 2. lower() returns the string in lower case
# -------------------------------------------------------------------------
print(b.casefold())    # 3. casefold() is like lower() but stronger and more aggressive
# -------------------------------------------------------------------------
c = " Hello world! "
print(c.strip())       # 4. strip() removes leading and trailing characters (space by default)
# -------------------------------------------------------------------------
print(c.lstrip())      # 5. lstrip() removes leading characters (space by default)
# -------------------------------------------------------------------------
print(c.rstrip())      # 6. rstrip() removes trailing characters (space by default)
# -------------------------------------------------------------------------
d = "Hello World"
print(d.replace("He", "J"))  # 7. replace() substitutes one substring with another
# -------------------------------------------------------------------------
e = "hello, world"
print(e.split(","))    # 8. split() returns a list of the parts between the separator
# -------------------------------------------------------------------------
f = "hello world"
print(f.capitalize())  # 9. converts the first character to upper case
# -------------------------------------------------------------------------
print(f.title())       # 10. title() upper-cases the first character of every word
# -------------------------------------------------------------------------
g = "Hello My Name Is PETER"
print(g.swapcase())    # 11. swapcase() swaps upper case and lower case
# slicing a string
string = "Hello world"  # the original never defined this variable (NameError)
# ------------------------------------------------------------------------
# slice from the start until position 5 (not included)
print(string[:5])
# ------------------------------------------------------------------------
# slice until the end from position 2
print(string[2:])
# ------------------------------------------------------------------------
# negative indexes: slice from position -5 (included) to -2 (not included)
print(string[-5:-2])
# align a string
txt = "hello"
# center() pads the string on both sides with a fill character.
# syntax: string.center(length, character)
#   length    Required: the length of the returned string
#   character Optional: fill character for the missing space on each side
#             (default " ", a space)
a = txt.center(20)
print(a)
# with fill character as "0"
a = txt.center(20, "0")
print(a)
print(len(a))
# -------------------------------------------------------------------------
# ljust() left-aligns the string within the given width, padding on the
# right with the fill character
a = txt.ljust(20, "*")
print(a, "world")
# -------------------------------------------------------------------------
# rjust() right-aligns the string within the given width, padding on the
# left with the fill character
a = txt.rjust(20, "*")
print(a)
# partition() searches for a specified string and splits around its FIRST
# occurrence into a 3-tuple:
#   [0] the part before the match, [1] the match itself, [2] the part after
txt = "I could eat bananas all day"
x = txt.partition("bananas")
print(x)
# ------------------------------------------------------------------------
# if the value is not found, partition() returns
# (the whole string, empty string, empty string)
txt = "I could eat bananas all day"
x = txt.partition("apples")
print(x)
# ------------------------------------------------------------------------
# rpartition() is the same but splits around the LAST occurrence
txt = "I could eat bananas all day, bananas are my favorite fruit"
x = txt.rpartition("bananas")
print(x)
# maketrans() builds a mapping table (a dict of character ordinals) to be
# used with translate()
txt = "Hello Sam!"
mytable = txt.maketrans("S", "P")
print(mytable)  # prints a mapping table with ascii values
# ------------------------------------------------------------------------
# translate() returns a string with characters replaced per the table/dict;
# characters absent from the mapping are left unchanged.
# With a plain dict you must use ascii codes (ordinals), not characters.
print(txt.translate(mytable))  # replace any character 'S' with a 'P' using a mapping table
# or
mydict = {83: 80}
print(txt.translate(mydict))  # replace any character 'S' with a 'P' using a dictionary
# ------------------------------------------------------------------------
txt = "Good night Sam!"
x = "mSa"
y = "eJo"
z = "odnght"
# the third parameter z lists the characters to remove from the string
mytable = txt.maketrans(x, y, z)
print(txt.translate(mytable))
# File handling demos, using with blocks so files are always closed.
# NOTE(review): these expect an existing 'abc.txt' in the working directory;
# the first open(..., 'r') raises FileNotFoundError otherwise — confirm the
# lab provides the file.
# Program 1 :- reading a file line by line
with open('abc.txt', 'r') as file:
    for each in file:
        print(each)
# Program 2 :- read() and seek()
with open('abc.txt', 'r') as file:
    print(file.read())   # prints entire file
    file.seek(0)         # rewind to the beginning
    print(file.read(5))  # prints first 5 characters from the file
# Program 3 :- 'w' mode truncates the file and writes fresh content
with open('abc.txt', 'w') as file:
    file.write('This is the write command')
# Program 4 :- Working of append() mode
with open('abc.txt', 'a') as file:
    file.write('This will add this line')
lst = [19, 22, 34, 26, 32, 30, 24, 24]


def mean(dataset):
    """Return the arithmetic mean of a non-empty numeric sequence."""
    return sum(dataset) / len(dataset)


print(mean(lst))
# calculating median
ls2 = [187, 187, 196, 196, 198, 203, 207, 211, 215]
ls3 = [181, 187, 196, 198, 203, 207, 211, 215]


def median(dataset):
    """Return the median of a non-empty numeric sequence.

    Odd length: the middle element of the sorted data.
    Even length: the average of the two middle elements.
    """
    data = sorted(dataset)
    index = len(data) // 2
    if len(data) % 2 != 0:  # original line was missing the colon (SyntaxError)
        return data[index]
    return (data[index - 1] + data[index]) / 2


print(median(ls2))
print(median(ls3))
# Calculating mode
ls2 = [3, 15, 23, 42, 30, 10, 10, 12]
ls3 = ['nike', 'adidas', 'nike', 'jordan', 'jordan', 'rebook', 'under_amour', 'adidas']


def mode(dataset):
    """Return a list of the most frequent value(s), in first-seen order.

    Handles multimodal data: every value tied for the highest count is
    included.
    """
    frequency = {}
    for value in dataset:
        frequency[value] = frequency.get(value, 0) + 1
    most_frequent = max(frequency.values())
    return [key for key, count in frequency.items() if count == most_frequent]


print(mode(ls2))
print(mode(ls3))
# Program 6 :- variance and standard deviation from the statistics module
# calculating variance
# statistics.variance computes the SAMPLE variance (denominator n - 1)
import statistics
print(statistics.variance([1, 3, 5, 7, 9, 11]))
print(statistics.variance([2, 2.5, 1.25, 3.1, 1.75, 2.8]))
print(statistics.variance([-11, 5.5, -3.4, 7.1]))
print(statistics.variance([1, 30, 50, 100]))
# calculating standard deviation
# statistics.stdev is the sample standard deviation: sqrt(sample variance)
import statistics
print(statistics.stdev([1, 3, 5, 7, 9, 11]))
print(statistics.stdev([2, 2.5, 1.25, 3.1, 1.75, 2.8]))
print(statistics.stdev([-11, 5.5, -3.4, 7.1]))
print(statistics.stdev([1, 30, 50, 100]))
# Pearson correlation with NumPy: np.corrcoef returns the 2x2 correlation
# matrix of the two inputs; the off-diagonal entries hold the coefficient.
import numpy as np
x_simple = np.array([-2, -1, 0, 1, 2])
y_simple = np.array([4, 1, 3, 2, 0])
my_rho = np.corrcoef(x_simple, y_simple)
print(my_rho)
import pandas as pd
import scipy.stats

x = [15, 18, 21, 15, 21]
y = [25, 25, 27, 27, 27]


def spearmans_rank_correlation(x, y):
    """Print the ranks of x and y and their Spearman rank correlation.

    Spearman's rho is computed as the Pearson correlation of the ranks;
    ties receive average ranks (pandas' default rank method).
    """
    xranks = pd.Series(x).rank()
    print("Rankings of X:")
    print(xranks)
    yranks = pd.Series(y).rank()
    print("Rankings of Y:")
    print(yranks)
    print("Spearman's Rank correlation:", scipy.stats.pearsonr(xranks, yranks)[0])


spearmans_rank_correlation(x, y)
import scipy.stats
NumPy arrays:
# manual construction of an array using the np.array function. The standard
# convention to import NumPy is: import numpy as np
>>> p
array([48.858598, 2.294495])
# There are two requirements of a NumPy array: a fixed size at creation and a uniform, fixed data
type, with a fixed size in memory. The following functions help you to get information on
the p matrix:
(2,)
dtype('float64')
Data types:
# We can easily convert or cast an array from one dtype to another using the astype method:
dtype('int64')
>>> float_b.dtype
dtype('float64')
Array creation:
There are various functions provided to create an array object. They are very
useful for us to create and store data in a multidimensional array in different
situations.
eye, identity Create a NxN identity matrix with ones >>> np.eye(2, dtype=int)
on the diagonal and zeros elsewhere array([[1, 0], [0, 1]])
ones, ones_like Create a new array with the given >>> np.ones(5)
shape and type, filled with 1s for all array([1., 1., 1., 1., 1.])
elements >>> np.ones(4, dtype=int)
array([1, 1, 1, 1])
>>> x = np.array([[0,1,2],
[3,4,5]])
>>> np.ones_like(x)
array([[1, 1, 1],[1, 1, 1]])
full, full_like Create a new array with the given >>> np.full((2,2), 3,
shape and type, filled with a selected dtype=int)
value array([[3, 3], [3, 3]])
>>> x = np.ones(3)
>>> np.full_like(x, 2)
array([2., 2., 2.])
array Create an array from the existing data >>> np.array([[1.1, 2.2, 3.3],
[4.4, 5.5, 6.6]])
array([[1.1, 2.2, 3.3], [4.4, 5.5,
6.6]])
copy Return an array copy of the given object >>> a = np.array([[1, 2], [3, 4]])
>>> np.copy(a)
array([[1, 2], [3, 4]])
fromstring Create 1-D array from a string or text >>> np.fromstring('3.14 2.17',
dtype=float, sep=' ')
array([3.14, 2.17])
As with other Python sequence types, such as lists, it is very easy to access and assign a value of each
array's element:
>>> a = np.arange(7)
>>> a
array([0, 1, 2, 3, 4, 5, 6])
(1, 4, 6)
Note
In Python, array indices start at 0. This is in contrast to Fortran or Matlab, where indices begin at 1.
As another example, if our array is multidimensional, we need tuples of integers to index an item:
>>> a[0, 2] = 10
>>> a
>>> b = a[2]
>>> b
array([7, 8, 9])
>>> c = a[:2]
>>> c
We call b and c as array slices, which are views on the original one. It means that the data is not
copied to b or c, and whenever we modify their values, it will be reflected in the array a as well:
>>> b[-1] = 11
>>> a
Fancy indexing
Besides indexing with slices, NumPy also supports indexing with Boolean or integer arrays (masks).
This method is called fancy indexing. It creates copies, not views.
>>> b = (a % 5 == 0)
>>> b
>>> c[b]
[5, 6, 7, 8],
>>> a[[2, 3], [0, 1]] # take elements at (2, 0) and (3, 1)
array([9, 14])
Note
The mask array must have the same length as the axis that it's indexing.
We are getting familiar with creating and accessing ndarrays. Now, we continue to the next step,
applying some mathematical operations to array data without writing any for loops, of course, with
higher performance.
Scalar operations will propagate the value to each element of the array:
>>> a = np.ones(4)
>>> a * 2
>>> a + 3
All arithmetic operations between arrays apply the operation element wise:
>>> a * a
>>> a + a
>>> a == b
array([True, False])
Array functions:
Many helpful array functions are supported in NumPy for analyzing data. We will list some part of
them that are common in use. Firstly, the transposing function is another kind of reshaping form
that returns a view on the original data array without copying anything:
>>> a.reshape(3, 2)
>>> a.T
In general, we have the swapaxes method that takes a pair of axis numbers and returns a view on
the data, without making a copy:
>>> a = np.array([[[0, 1, 2], [3, 4, 5]],
>>> a.swapaxes(1, 2)
array([[[0, 3],
[1, 4],
[2, 5]],
[[6, 9],
[7, 10],
[8, 11]]])
The transposing function is used to do matrix computations; for example, computing the inner
matrix product XT.X using np.dot:
>>> a = np.array([[1, 2, 3],[4,5,6]])
>>> np.dot(a.T, a)
Sorting data in an array is also an important demand in processing data. Let's take a look at some
sorting functions and their use:
>>> b
>>> a[0][b[0]]
array([1, 6, 6, 34])
sin, cos, tan, cosh, sinh, tanh, arcos, arctan, deg2rad Trigonome
>>> a =
tric and
hyperbolic np.array([0.,30.,
functions
45.])
>>> np.sin(a *
np.pi / 180)
array([0., 0.5,
0.70710678])
array([0., 2.])
sqrt, square, exp, expm1, exp2, log, log10, log1p, logaddex Computing
>>>
p the
exponents np.exp(np.array([
and
logarithms 2.25, 3.16]))
of an array
array([9.4877,
23.5705])
a.reshape(2,3)
>>> x2 =
np.arange(3)
Function Description Example
>>>
np.multiply(x1,
x2)
array([[0,1,4],
[0,4,10]])
[True, True,
True]], dtype =
bool)
With the NumPy package, we can easily solve many kinds of data processing tasks without writing
complex loops. It is very helpful for us to control our code as well as the performance of the
program. In this part, we want to introduce some mathematical and statistical functions.
See the following table for a listing of mathematical and statistical functions:
array([5, 9])
array([8, 15])
[2., 2.]])]
array([0, -3])
>>> np.var(a)
1.25
>>> np.intersect1d(a,b)
We can also save and load data to and from a disk, either in text or binary format, by using different
supported functions in NumPy package.
Saving an array
Arrays are saved by default in an uncompressed raw binary format, with the file extension .npy by
the np.save function:
>>> a = np.array([[0, 1, 2], [3, 4, 5]])
>>> np.save('test1.npy', a)
Note
>>> b = np.arange(7)
>>> dic['arr0']
array([0, 1, 2, 3])
Another way to save array data into a file is using the np.savetxt function that allows us to set format
properties in the output file:
>>> x = np.arange(4)
Loading an array
We have two common functions such as np.load and np.loadtxt, which correspond to the saving
functions, for loading an array:
>>> np.load('test1.npy')
Similar to the np.savetxt function, the np.loadtxt function also has a lot of options for loading an
array from a text file.
We can also save and load data to and from a disk, either in text or binary format, by using different
supported functions in NumPy package.
Saving an array
Arrays are saved by default in an uncompressed raw binary format, with the file extension .npy by
the np.save function:
>>> a = np.array([[0, 1, 2], [3, 4, 5]])
>>> np.save('test1.npy', a)
Note
The library automatically assigns the .npy extension, if we omit it.
If we want to store several arrays into a single file in an uncompressed .npz format, we can use
the np.savez function, as shown in the following example:
>>> a = np.arange(4)
>>> b = np.arange(7)
The .npz file is a zipped archive of files named after the variables they contain. When we load
an .npz file, we get back a dictionary-like object that can be queried for its lists of arrays:
>>> dic['arr0']
array([0, 1, 2, 3])
Another way to save array data into a file is using the np.savetxt function that allows us to set format
properties in the output file:
>>> x = np.arange(4)
Loading an array
We have two common functions such as np.load and np.loadtxt, which correspond to the saving
functions, for loading an array:
>>> np.load('test1.npy')
Similar to the np.savetxt function, the np.loadtxt function also has a lot of options for loading an
array from a text file.
Saving an array
Arrays are saved by default in an uncompressed raw binary format, with the file extension .npy by
the np.save function:
>>> a = np.array([[0, 1, 2], [3, 4, 5]])
>>> np.save('test1.npy', a)
Note
>>> b = np.arange(7)
The .npz file is a zipped archive of files named after the variables they contain. When we load
an .npz file, we get back a dictionary-like object that can be queried for its lists of arrays:
>>> dic = np.load('test2.npz')
>>> dic['arr0']
array([0, 1, 2, 3])
Another way to save array data into a file is using the np.savetxt function that allows us to set format
properties in the output file:
>>> x = np.arange(4)
Loading an array
We have two common functions such as np.load and np.loadtxt, which correspond to the saving
functions, for loading an array:
>>> np.load('test1.npy')
Similar to the np.savetxt function, the np.loadtxt function also has a lot of options for loading an
array from a text file.
Loading an array
We have two common functions such as np.load and np.loadtxt, which correspond to the saving
functions, for loading an array:
>>> np.load('test1.npy')
Similar to the np.savetxt function, the np.loadtxt function also has a lot of options for loading an
array from a text file.
Linear algebra is a branch of mathematics concerned with vector spaces and the mappings between
those spaces. NumPy has a package called linalg that supports powerful linear algebra functions. We
can use these functions to find eigenvalues and eigenvectors or to perform singular value
decomposition:
>>> A = np.array([[1, 4, 6],
[5, 2, 2],
[-1, 6, 8]])
>>> w, v = np.linalg.eig(A)
>>> w # eigenvalues
>>> v # eigenvector
The function is implemented using the geev Lapack routines that compute the eigenvalues and
eigenvectors of general square matrices.
Another common problem is solving linear systems such as Ax = b with A as a matrix and x and b as
vectors. The problem can be solved easily using the numpy.linalg.solve function:
>>> A = np.array([[1, 4, 6], [5, 2, 2], [-1, 6, 8]])
>>> b = np.array([[1], [2], [3]])
>>> x = np.linalg.solve(A, b)
>>> x
The following table will summarise some commonly used functions in the numpy.linalg package:
Function Description Example
>>> np.dot(a,b)
>>> np.inner(a,b)
>>> np.linalg.norm(a)
2.23606
-2.0
>>> np.linalg.inv(a)
14.933034
4
8.Python Programs for creation and manipulation of DataFrames
using Pandas Library
Pandas is a Python package that supports fast, flexible, and expressive data structures, as well as
computing functions for data analysis. The following are some prominent features that Pandas
supports:
Data structure with labeled axes. This makes the program clean and clear and avoids
common errors from misaligned data.
Flexible handling of missing data.
Intelligent label-based slicing, fancy indexing, and subset creation of large datasets.
Powerful arithmetic operations and statistical computations on a custom axis via axis label.
Robust input and output support for loading or saving data from and to files, databases, or
HDF5 format.
Related to Pandas installation, we recommend an easy way, that is to install it as a part of Anaconda,
a cross-platform distribution for data analysis and scientific computing. You can refer to the
reference at http://docs.continuum.io/anaconda/ to download and install the library.
After installation, we can use it like other Python packages. Firstly, we have to import the following
packages at the beginning of the program:
Let's first get acquainted with two of Pandas' primary data structures: the Series and the DataFrame.
They can handle the majority of use cases in finance, statistic, social science, and many areas of
engineering.
Series
A Series is a one-dimensional object similar to an array, list, or column in table. Each item in a Series
is assigned to an entry in an index:
>>> s1 = pd.Series(np.random.rand(4),
>>> s1
a 0.6122
b 0.98096
c 0.3350
d 0.7221
dtype: float64
By default, if no index is passed, it will be created to have values ranging from 0 to N-1, where N is
the
length of the Series:
>>> s2 = pd.Series(np.random.rand(4))
>>> s2
0 0.6913
1 0.8487
2 0.8627
3 0.7286
dtype: float64
>>> s1['c']
0.3350
>>>s1['c'] = 3.14
a 0.6122
b 0.98096
This accessing method is similar to a Python dictionary. Therefore, Pandas also allows us to initialize
a Series object directly from a Python dictionary:
'003': 'Peter'})
>>> s3
001 Nam
002 Mary
003 Peter
dtype: object
Sometimes, we want to filter or rename the index of a Series created from a Python dictionary. At
such times, we can pass the selected index list directly to the initial function, similarly to the process
in the above example. Only elements that exist in the index list will be in the Series object.
Conversely, indexes that are missing in the dictionary are initialized to default NaN values by Pandas:
>>> s4 = pd.Series({'001': 'Nam', '002': 'Mary',
>>> s4
002 Mary
001 Nam
024 NaN
065 NaN
dtype: object
ect
002 False
001 False
024 True
065 True
dtype: bool
>>> s5
x 2.71
y 2.71
dtype: float64
A Series object can be initialized with NumPy objects as well, such as ndarray. Moreover, Pandas can
automatically align data indexed in different ways in arithmetic operations:
>>> s6 = pd.Series(np.array([2.71, 3.14]), index=['z', 'y'])
>>> s6
z 2.71
y 3.14
dtype: float64
>>> s5 + s6
x NaN
y 5.85
z NaN
dtype: float64
The DataFrame
The DataFrame is a tabular data structure comprising a set of ordered columns and rows. It can be
thought of as a group of Series objects that share an index (the column names). There are a number
of ways to initialize a DataFrame object. Firstly, let's take a look at the common example of creating
DataFrame from a dictionary of lists:
>>> df1
By default, the DataFrame constructor will order the column alphabetically. We can edit the default
order by passing the column's attribute to the initializing function:
'Median_Age'])
>>> df2
>>> df2.index
>>> df3.index
Columns can be accessed by column name as a Series can, either by dictionary-like notation or as an
attribute, if the column name is a syntactically valid attribute name:
0 Peter
1 Mary
2 Nam
3 Mai
4 John
To modify or append a new column to the created DataFrame, we specify the column name and the
value we want to assign:
>>> df4.loc[1]
name Mary
age 21
career student
province SG
sex F
award None
A DataFrame object can also be created from different data structures such as a list of dictionaries, a
dictionary of Series, or a record array. The method to initialize a DataFrame object is similar to the
examples above.
Another common case is to provide a DataFrame with data from a location such as a text file. In this
situation, we use the read_csv function that expects the column separator to be a comma, by
default. However, we can change that by using the sep parameter:
# person.csv file
name,age,career,province,sex
Peter,16,pupil,TN,M
Mary,21,student,SG,F
Nam,22,student,HN,M
Mai,31,nurse,SG,F
John,28,lawyer,SG,M
>>> df4
0 Peter 16 pupil TN M
1 Mary 21 student SG F
2 Nam 22 student HN M
3 Mai 31 nurse SG F
4 John 28 lawyer SG M
While reading a data file, we sometimes want to skip a line or an invalid value. As for
Pandas 0.16.2, read_csv supports over 50 parameters for controlling the loading process. Some
common useful parameters are as follows:
sep: This is a delimiter between columns. The default is comma symbol.
dtype: This is a data type for data or columns.
header: This sets row numbers to use as the column names.
skiprows: This skips line numbers to skip at the start of the file.
error_bad_lines: This shows invalid lines (too many fields) that will, by default, cause an
exception, such that no DataFrame will be returned. If we set the value of this parameter
as false, the bad lines will be skipped.
Moreover, Pandas also has support for reading and writing a DataFrame directly from or to a
database such as the read_frame or write_frame function within the Pandas module.
Pandas supports many essential functionalities that are useful to manipulate Pandas data structures.
In this book, we will focus on the most important features regarding exploration and analysis.
Reindex is a critical method in the Pandas data structures. It confirms whether the new or modified
data satisfies a given set of labels along a particular axis of Pandas object.
2 0.8627
b NaN
3 0.7286
dtype: float64
When reindexed labels do not exist in the data object, a default value of NaN will be automatically
assigned to the position; this holds true for the DataFrame case as well:
>>> df1.reindex(index=[0, 2, 'b', 3],
We can change the NaN value in the missing index case to a custom value by setting
the fill_value parameter. Let us take a look at the arguments that the reindex function supports, as
shown in the following table:
Argumen
t Description
method This is the method to use for filling holes in a reindexed object. The default setting is
unfill gaps.
pad/ffill: fill values forward
backfill/bfill: fill values backward
nearest: use the nearest value to fill the gap
level The matches index values on the passed multiple index level.
fill_value This is the value to use for missing values. The default setting is NaN.
limit This is the maximum size gap to fill in forward or backward method.
>>> s7 = pd.Series(np.random.rand(10000))
>>> s7.head()
0 0.631059
1 0.766085
2 0.066891
3 0.867591
4 0.339678
dtype: float64
>>> s7.tail(3)
9997 0.412178
9998 0.800711
9999 0.438344
dtype: float64
We can also use these functions for DataFrame objects in the same way.
Binary operations
Firstly, we will consider arithmetic operations between objects. In different indexes objects case, the
expected result will be the union of the index pairs. We will not explain this again because we had an
example about it in the above section (s5 + s6). This time, we will show another example with a
DataFrame:
>>> df5 = pd.DataFrame(np.arange(9).reshape(3,3),
columns=['a','b','c'])
>>> df5
a b c
0 0 1 2
1 3 4 5
2 6 7 8
columns=['a','b','c','d'])
>>> df6
a b c d
0 0 1 2 3
1 4 5 6 7
a b c d
0 0 2 4 NaN
1 7 9 11 NaN
The mechanisms for returning the result between two kinds of data structure are similar. A problem
that we need to consider is the missing data between objects. In this case, if we want to fill with a
fixed value, such as 0, we can use the arithmetic functions such as add, sub, div, and mul, and the
function's supported parameters such as fill_value:
>>> df7 = df5.add(df6, fill_value=0)
>>> df7
a b c d
0 0 2 4 3
1 7 9 11 7
2 6 7 8 NaN
a b c d
0 True True True False
Functional statistics
The supported statistics method of a library is really important in data analysis. To get inside a big
data object, we need to know some summarized information such as mean, sum, or quantile. Pandas
supports a large number of methods to compute them. Let's consider a simple example of
calculating the sum information of df5, which is a DataFrame object:
>>> df5.sum()
a 9
b 12
c 15
dtype: int64
When we do not specify which axis we want to calculate sum information, by default, the function
will calculate on index axis, which is axis 0:
Series: We do not need to specify the axis.
DataFrame: Columns (axis = 1) or index (axis = 0). The default setting is axis 0.
We also have the skipna parameter that allows us to decide whether to exclude missing data or not.
By default, it is set as true:
>>> df7.sum(skipna=False)
a 13
b 18
c 23
d NaN
dtype: float64
Another function that we want to consider is describe(). It is very convenient for us to summarize
most of the statistical information of a data structure such as the Series and DataFrame, as well:
>>> df5.describe()
a b c
count 3.0 3.0 3.0
We can specify percentiles to include or exclude in the output by using the percentiles parameter;
for example, consider the following:
>>> df5.describe(percentiles=[0.5, 0.8])
a b c
Here, we have a summary table for common supported statistics functions in Pandas:
Function Description
idxmin(axis), idxmax(axis) These compute the index labels with the minimum or maximum
corresponding values.
mean(), median(), min(), max() These return the mean, median, minimum, and maximum values of
an axis in a data object.
std(), var(), sem() These return the standard deviation, variance, and standard
error of the mean.
Function application
Pandas supports function application that allows us to apply some functions supported in other
packages such as NumPy or our own functions on data structure objects. Here, we illustrate two
examples of these cases, firstly, using apply to execute the std() function, which is the standard
deviation calculating function of the NumPy package:
>>> df5.apply(np.std, axis=1) # default: axis=0
0 0.816497
1 0.816497
2 0.816497
dtype: float64
Secondly, if we want to apply a formula to a data object, we can also use the apply function by following
these steps:
1. Define the function or formula that you want to apply on a data object.
2. Call the defined function or formula via apply. In this step, we also need to figure out the
axis that we want to apply the calculation to:
3. >>> f = lambda x: x.max() - x.min() # step 1
5. 0 2
6. 1 2
7. 2 2
8. dtype: int64
12. a b c
Sorting
There are two kinds of sorting method that we are interested in: sorting by row or column index and
sorting by data value.
Firstly, we will consider methods for sorting by row and column index. In this case, we have
the sort_index() function. We also have the axis parameter to set whether the function should sort by
row or column. The ascending option with the true or false value allows us to sort data in
ascending or descending order. The default setting for this option is true:
>>> df7 = pd.DataFrame(np.arange(12).reshape(3,4),
>>> df7
b d a c
x 0 1 2 3
y 4 5 6 7
z 8 9 10 11
>>> df7.sort_index(axis=1)
a b c d
x 2 0 3 1
y 6 4 7 5
z 10 8 11 9
Series has a method order that sorts by value. For NaN values in the object, we can also have a
special treatment via the na_position option:
>>> s4.order(na_position='first')
024 NaN
065 NaN
002 Mary
001 Nam
dtype: object
>>> s4
002 Mary
001 Nam
024 NaN
065 NaN
dtype: object
Besides that, Series also has the sort() function that sorts data by value. However, the function will
not return a copy of the sorted data:
>>> s4.sort(na_position='first')
>>> s4
024 NaN
065 NaN
002 Mary
001 Nam
dtype: object
If we want to apply sort function to a DataFrame object, we need to figure out which columns or
rows will be sorted:
b d a c
z 8 9 10 11
y 4 5 6 7
x 0 1 2 3
If we do not want to automatically save the sorting result to the current data object, we can change
the setting of the inplace parameter to False.
9. Write a Python program for the
following.
• Simple Line Plots,
• Adjusting the Plot: Line Colors and Styles, Axes Limits, Labeling Plots,
• Simple Scatter Plots,
• Histograms,
• Customizing Plot Legends,
• Choosing Elements for the Legend,
• Boxplot
• Multiple Legends,
• Customizing Colorbars,
• Multiple Subplots,
• Text and Annotation,
• Customizing Ticks
# Simple Line Plots and Adjusting the Plot: Line Colors and Styles, Axes Limits, Labeling Plots
The easiest way to get started with plotting using matplotlib is often by using the MATLAB-like API
that is supported by the package:
>>> import matplotlib.pyplot as plt
>>> x = linspace(0, 3, 6)
>>> x
>>> y = power(x,2)
>>> y
>>> figure()
>>> xlabel('x')
>>> ylabel('y')
>>> plt.show()
However, star imports should not be used unless there is a good reason for doing so. In the case of
matplotlib, we can use the canonical import:
>>> import matplotlib.pyplot as plt
>>> plt.ylabel('y')
>>> plt.show()
If we only provide a single argument to the plot function, it will automatically use it as the y values
and generate the x values from 0 to N-1, where N is equal to the number of values:
>>> plt.plot(y)
>>> plt.xlabel('x')
>>> plt.ylabel('y')
>>> plt.show()
Line properties
The default line format when we plot data in matplotlib is a solid blue line, which is abbreviated
as b-. To change this setting, we only need to add the symbol code, which includes letters as color
string and symbols as line style string, to the plot function. Let us consider a plot of several lines with
different format styles:
>>> plt.plot(x*2, 'g^', x*3, 'rs', x**x, 'y-')
>>> plt.show()
>>> line.set_linestyle('--')
>>> plt.show()
The following table lists some common properties of the line2d plotting:
Property Value type Description
color or c Any matplotlib color This sets the color of the line in the figure
data np.array xdata, np.array ydata This sets the data used for visualization
linestyle or ls [ '-' | '--' | '-.' | ':' | ...] This sets the line style in the figure
linewidth or lw Float value in points This sets the width of the line in the figure
marker Any symbol This sets the style at data points in the
figure
# Simple Scatter Plots,
Scatter plots
A scatter plot is used to visualize the relationship between variables measured in the same dataset.
It is easy to plot a simple scatter plot, using the plt.scatter() function, that requires numeric columns
for both the x and y axis:
>>> plt.show()
# Histograms
Histogram plots
A histogram represents the distribution of numerical data graphically. Usually, the range of values is
partitioned into bins of equal size, with the height of each bin corresponding to the frequency of
values within that bin:
facecolor='g', alpha=0.75)
histtype='bar', rwidth=0.8)
>>> ax1.set_title('unequal bins histogram')
>>> plt.tight_layout()
>>> plt.show()
>>> y1 = np.sin(x)
>>> y2 = np.cos(x)
>>> y3 = np.tan(x)
>>> plt.show()
If we want to split the legend into multiple boxes in a figure, we can manually set our expected
labels for plot lines, as shown in the following image:
>>> # with above code, only 'y=tan(x)' legend appears in the figure
>>> plt.gca().add_artist(lsin)
>>> plt.gca().add_artist(lcos)
>>> plt.tight_layout()
>>> plt.show()
The other element in a figure that we want to introduce is the annotations which can consist of text,
arrows, or other shapes to explain parts of the figure in detail, or to emphasize some special data
points. There are different methods for showing annotations, such as text, arrow, and annotation.
The text method draws text at the given coordinates (x, y) on the plot; optionally with
custom properties. There are some common arguments in the function: x, y, label text, and
font-related properties that can be passed in via fontdict, such as family, fontsize, and style.
The annotate method can draw both text and arrows arranged appropriately. Arguments of
this function are s (the label text), xy (the position of the element to annotate), xytext (the
position of the label s), xycoords (the string that indicates what type of coordinates xy is),
and arrowprops (the dictionary of line properties for the arrow that connects the
annotation).
Here is a simple example to illustrate the annotate and text functions:
>>> x = np.linspace(-2.4, 0.4, 20)
fontsize=14, style='italic')
xytext=(-1, 0.3),
horizontalalignment='center',
verticalalignment='top',
arrowprops=dict(arrowstyle='->',
connectionstyle='arc3'))
>>> plt.show()
>>> plt.legend()
>>> plt.show()
Another example will visualize the data of a DataFrame object consisting of multiple columns:
>>> data = {'Median_Age': [24.2, 26.4, 28.5, 30.3],
>>> plt.tight_layout();
>>> plt.show()
The plot method of the DataFrame has a number of options that allow us to handle the plotting of
the columns. For example, in the above DataFrame visualization, we chose to plot the columns in
separate subplots. The following table lists more options:
Argument Value Description
sharex, sharey True/False This shares the same x or y axis, linking ticks and limits
10. Python Programs for Data preprocessing:
Handling missing values, handling categorical
data, bringing features to same scale,
selecting meaningful features
Importing the libraries:
# libraries
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling
If you select and run the above code in Spyder, you should see a similar output in your
IPython console.
If you see any import errors, try to install those packages explicitly using pip command as
follows.
pip install <package-name>
First of all, let us have a look at the dataset we are going to use for this particular
example. You can find the dataset at
https://github.com/tarunlnmiit/machine_learning/blob/master/DataPreprocessing.csv
In order to import this dataset into our script, we are apparently going to use
pandas as follows.
dataset = pd.read_csv('Data.csv') # to import the dataset into a variable
# Splitting the attributes into independent and dependent attributes
When you run this code section, you should not see any errors, if you do make
sure the script and the Data.csv are in the same folder. When successfully
executed, you can move to variable explorer in the Spyder UI and you will see the
following three variables.
When you double click on each of these variables, you should see something
similar.
If you face any errors in order to see these data variables, try to upgrade Spyder
to Spyder version 4.
Well the first idea is to remove the lines in the observations where there is some
missing data. But that can be quite dangerous because imagine this data set
contains crucial information. It would be quite dangerous to remove an
observation. So we need to figure out a better idea to handle this problem. And
another idea that’s actually the most common idea to handle missing data is to
take the mean of the columns.
If you noticed in our dataset, we have two values missing, one for age column in
7th data row and for Income column in 5th data row. Missing values should be
handled during the data analysis. So, we do that as follows.
# handling the missing data and replace missing values with nan from numpy and
replace with mean of all the other values
After execution of this code, the independent variable X will transform into the
following.
Here you can see, that the missing values have been replaced by the average
values of the respective columns.
In this dataset we can see that we have two categorical variables. We have the
Region variable and the Online Shopper variable. These two variables are
categorical variables because simply they contain categories. The Region
contains three categories. It’s India, USA & Brazil and the online shopper
variable contains two categories. Yes and No that’s why they’re called
categorical variables.
You can guess that since machine learning models are based on mathematical
equations you can intuitively understand that it would cause some problem if we
keep the text here in the categorical variables in the equations because we would
only want numbers in the equations. So that’s why we need to encode the
categorical variables. That is to encode the text that we have here into numbers.
To do this we use the following code snippet.
Here, you can see that the Region variable is now made up of a 3 bit binary
variable. The left most bit represents India, 2nd bit represents Brazil and the
last bit represents USA. If the bit is 1 then it represents data for that country
otherwise not. For Online
Shopper variable, 1 represents Yes and 0 represents No.
Splitting the dataset into training and testing datasets
Here, we are taking training set to be 80% of the original data set and testing set
to be 20% of the original data set. This is usually the ratio in which they are split.
But, you can come across sometimes to a 70–30% or 75–25% ratio split. But, you
don’t want to split it 50–50%. This can lead to Model Overfitting. This topic is
too huge to be covered in the same post. I will cover it in some future post. For
now, we are going to split it in 80–20% ratio.
After split, our training set and testing set look like this.
Feature Scaling
As you can see, we have these two columns, age and income, that contain
numerical values. You will notice that the variables are not on the same scale,
because the ages range from 32 to 55 while the salaries range from 57.6K to
99.6K.
Because the age variable and the salary variable are not on the same scale,
this will cause some issues in your machine learning models. Why is that?
It's because a lot of machine learning models are based on what is
called the Euclidean distance.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
After the execution of this code, our training independent variable X and our
testing independent variable X and look like this.