# 讲解 Data Science and Applications调试Python程序

Data Science and Applications

Lecture 1: Visualization, Probability and Statistics

Outline

Visualization using Python

Review of Probability Essentials

Review of Statistics Essentials

Visualization Basics

The first step in understanding data is visualization. This gives the best insight.

There are different ways we can visualize data:

- as simple plots

- as dots in the space, colored by class (clustering)

- as frequency of appearance (histograms)

- any of the above after pre-processing the data by some algorithm

We will use Python 3 for all our visualization, and I will use IDLE for editing and running all Python code. You will receive a file containing the sample code shown in these slides.

Make sure you install Python as soon as possible and are able to run the simplest program:

# Simplest possible program: print a greeting.
# The string must be delimited by plain ASCII double quotes; the typographic
# quotes produced by slide/word-processor software are a SyntaxError in Python.
print("Hello Class of Data Science in Python – Winter 2021!")

You should save this code in a file named hello.py. If you use IDLE, press F5 or choose Run > Run Module from the menu to execute it.

Simple plot

# Start the code with these 3 imports:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# Define a style. The name must be wrapped in plain ASCII quotes; a curly
# closing quote (as pasted from slides) is a SyntaxError.
plt.style.use('classic')

# Create an array of 100 evenly spaced numbers from 0 to 10.
x = np.linspace(0, 10, 100)

# Prepare a figure and draw both curves on the same axes.
fig = plt.figure()
plt.plot(x, np.sin(x), '-')   # sine as a solid line
plt.plot(x, np.cos(x), '--')  # cosine as a dashed line

# Display the figure.
plt.show()

# Save the figure into a file (format is taken from the extension).
fig.savefig('my_figure.png')

Simple plot – cont.

# Start the code with these 3 imports:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# Define a style. The name must be wrapped in plain ASCII quotes; a curly
# closing quote (as pasted from slides) is a SyntaxError.
plt.style.use('classic')

# Create an array of 100 evenly spaced numbers from 0 to 10.
x = np.linspace(0, 10, 100)

# Create a plot figure that will hold two stacked panels.
plt.figure()

# Create the first of two panels and make it the current axes.
plt.subplot(2, 1, 1)  # (rows, columns, panel number)
plt.plot(x, np.sin(x))

# Create the second panel and make it the current axes.
plt.subplot(2, 1, 2)
plt.plot(x, np.cos(x))

# Display the figure.
plt.show()

Simple plot - cont

# Start the code with these 3 imports (numpy is required for np.linspace,
# np.sin and np.cos below):
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# Create an array of 100 evenly spaced numbers from 0 to 10.
# It is defined here so this example runs on its own.
x = np.linspace(0, 10, 100)

# First create a grid of plots.
# ax will be an array of two Axes objects.
fig, ax = plt.subplots(2)

# Call the plot() method on the appropriate Axes object.
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x))

# Display the figure.
plt.show()

# Save the figure into a file (format is taken from the extension).
fig.savefig('my_figure.png')

Line plots

# NOTE: "%matplotlib inline" is an IPython/Jupyter magic command, not Python
# syntax. It is a SyntaxError when this file is run as a script (e.g. from
# IDLE), so it is kept here only as a comment; uncomment it in a notebook cell.
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): recent Matplotlib releases appear to have renamed this style
# to 'seaborn-v0_8-whitegrid' -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')

fig = plt.figure()
ax = plt.axes()

# Create an array of 100 evenly spaced numbers from 0 to 10.
# It is defined here so this example runs on its own.
x = np.linspace(0, 10, 100)

# The same sine curve, shifted by 0..5, drawn with each way of naming a color.
plt.plot(x, np.sin(x - 0), color='blue')           # specify color by name
plt.plot(x, np.sin(x - 1), color='g')              # short color code (rgbcmyk)
plt.plot(x, np.sin(x - 2), color='0.75')           # grayscale between 0 and 1
plt.plot(x, np.sin(x - 3), color='#FFDD44')        # hex code (RRGGBB from 00 to FF)
plt.plot(x, np.sin(x - 4), color=(1.0, 0.2, 0.3))  # RGB tuple, values 0 to 1
plt.plot(x, np.sin(x - 5), color='chartreuse')     # all HTML color names supported

plt.show()

Line plots – cont.

import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): recent Matplotlib releases appear to have renamed this style
# to 'seaborn-v0_8-whitegrid' -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')

fig = plt.figure()
ax = plt.axes()

x = np.linspace(0, 10, 100)

# Line styles can be given by full name or, for short, by an equivalent code.
# The first four entries are the long names; the last four are the short
# codes for solid, dashed, dashdot and dotted respectively.
line_styles = [
    'solid', 'dashed', 'dashdot', 'dotted',
    '-', '--', '-.', ':',
]

# Each line is shifted up by one more unit so that all eight stay visible.
for offset, style in enumerate(line_styles):
    plt.plot(x, x + offset, linestyle=style)

plt.show()

Line plots – cont.

import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): recent Matplotlib releases appear to have renamed this style
# to 'seaborn-v0_8-whitegrid' -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')

fig = plt.figure()
ax = plt.axes()

x = np.linspace(0, 10, 100)

# A format string combines a line-style code with a one-letter color code.
format_codes = [
    '-g',   # solid green
    '--c',  # dashed cyan
    '-.k',  # dashdot black
    ':r',   # dotted red
]

# Shift each line up by one more unit so the four lines do not overlap.
for offset, fmt in enumerate(format_codes):
    plt.plot(x, x + offset, fmt)

plt.show()

Line plots – cont.

import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): recent Matplotlib releases appear to have renamed this style
# to 'seaborn-v0_8-whitegrid' -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')

x = np.linspace(0, 10, 100)

# Example 1: control the visible range of each axis.
fig = plt.figure()
ax = plt.axes()
plt.plot(x, np.sin(x))
plt.xlim(-1, 11)
plt.ylim(-1.5, 1.5)
plt.show()

# Example 2: add a title and axis labels.
# Start a fresh figure explicitly so this example does not depend on what the
# backend does with the previous figure after show().
plt.figure()
plt.plot(x, np.sin(x))
plt.title("A Sine Curve")
plt.xlabel("x")
plt.ylabel("sin(x)")

# Without this call the labelled plot is never displayed when run as a script.
plt.show()

1D Histograms plots

import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): recent Matplotlib releases appear to have renamed this style
# to 'seaborn-v0_8-white' -- confirm against the installed version.
plt.style.use('seaborn-white')

# 1000 samples drawn from the standard normal distribution.
data = np.random.randn(1000)

# Example 1: histogram with all default settings.
plt.figure()
plt.hist(data)

# Example 2: a customized histogram of the same data. It gets its own figure
# because density=True rescales the bars, so it should not be overlaid on the
# count-scaled default histogram above.
# String arguments use plain ASCII quotes; a curly closing quote is a
# SyntaxError.
plt.figure()
plt.hist(data, bins=30, density=True, alpha=0.5,
         histtype='stepfilled', color='steelblue',
         edgecolor='none')

# Example 3: compare three distributions on one set of axes.
# np.random.normal(mean, standard deviation, number of samples)
x1 = np.random.normal(0, 0.8, 1000)
x2 = np.random.normal(-2, 1, 1000)
x3 = np.random.normal(3, 2, 1000)

# Shared keyword arguments; alpha=0.3 keeps overlapping histograms visible.
kwargs = dict(histtype='stepfilled',
              alpha=0.3, density=True, bins=40)

plt.figure()
plt.hist(x1, **kwargs)
plt.hist(x2, **kwargs)
plt.hist(x3, **kwargs)

# Display all three figures.
plt.show()

2D Histograms plots

# Imports repeated here so this example runs on its own.
import numpy as np
import matplotlib.pyplot as plt

# Draw 10000 points from a 2D Gaussian with the given mean and covariance.
mean = [0, 0]
cov = [[1, 1], [1, 2]]

# multivariate_normal returns one row per sample; transposing lets the two
# coordinates be unpacked into separate x and y arrays.
x, y = np.random.multivariate_normal(mean, cov, 10000).T

# Use a dedicated figure so the 2D histogram is not drawn on top of whatever
# figure happens to be current.
plt.figure()
plt.hist2d(x, y, bins=30, cmap='Blues')

# The colorbar maps color intensity to the number of points in each bin.
cb = plt.colorbar()
cb.set_label('counts in bin')

# Display the figure.
plt.show()

Kernel Density Estimation (KDE)

from scipy.stats import gaussian_kde
import numpy as np
import matplotlib.pyplot as plt

# Draw 10000 points from a 2D Gaussian with the given mean and covariance.
mean = [0, 0]
cov = [[1, 1], [1, 2]]
x, y = np.random.multivariate_normal(mean, cov, 10000).T

# Fit an array of size [Ndim, Nsamples].
data = np.vstack([x, y])
kde = gaussian_kde(data)

# Evaluate the density estimate on a regular 40 x 40 grid.
xgrid = np.linspace(-3.5, 3.5, 40)
ygrid = np.linspace(-6, 6, 40)
Xgrid, Ygrid = np.meshgrid(xgrid, ygrid)
Z = kde.evaluate(np.vstack([Xgrid.ravel(), Ygrid.ravel()]))

# Plot the result as an image.
# The keyword is 'origin' (the slides had the typo 'rigin', which imshow
# rejects). origin='lower' puts the first row of Z at the bottom, and extent
# gives the axes the same data range as the evaluation grid.
plt.imshow(Z.reshape(Xgrid.shape),
           origin='lower', aspect='auto',
           extent=[-3.5, 3.5, -6, 6],
           cmap='Blues')

cb = plt.colorbar()
cb.set_label("density")

plt.show()

Example: Digits

# Imports repeated here so this example runs on its own.
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.manifold import Isomap

# Load images of the digits 0 through 5 (n_class=6 keeps the first six
# classes) and visualize the first 64 of them in an 8 x 8 grid.
digits = load_digits(n_class=6)

fig, ax = plt.subplots(8, 8, figsize=(6, 6))
for i, axi in enumerate(ax.flat):
    # The loop body must be indented: show one image per panel, without ticks.
    axi.imshow(digits.images[i], cmap='binary')
    axi.set(xticks=[], yticks=[])

# Project the digits into 2 dimensions using IsoMap.
iso = Isomap(n_components=2)
projection = iso.fit_transform(digits.data)

# Plot the results in a new figure; otherwise the scatter plot would be drawn
# inside the last panel of the image grid above.
plt.figure()

# Color each point by its digit label using a 6-entry discrete colormap.
# plt.get_cmap(name, lut) is used instead of plt.cm.get_cmap, which has been
# removed from recent Matplotlib releases.
plt.scatter(projection[:, 0], projection[:, 1], lw=0.1,
            c=digits.target, cmap=plt.get_cmap('cubehelix', 6))
plt.colorbar(ticks=range(6), label='digit value')

# Color limits of -0.5..5.5 center each integer label 0..5 in its color band.
plt.clim(-0.5, 5.5)

# Display both figures.
plt.show()

Datasets

A comprehensive dataset site is Kaggle https://www.kaggle.com/datasets

Now that we have the basics of visualization, we can pick any dataset and try to discover interesting things about it.

We will get back to this question with more tools as we progress in this course.