Web GPU
Web GPU
Abdul Dakkak
!1
Overview
IMPACT "
History"
Architecture"
Some data"
Lessons"
Future
01
!2
Quick Demo
!3
IMPACT
Objective
Create a programming
environment for the Coursera
course"
01
!5
Previous System
New System
01
!7
Architecture
!8
Library+!
User Code
Life of a
Worker
Program
Submission
Library+!
User Code
Library+!
User Code
Worker
Worker
Web!
Server
DB
Coursera
!1010
Detailed Architecture
!11
Made it public,
so got some help
from students
Correlated and
source of major
problems (happens
on Cyclone and AWS)
Source of Bugs
Incorrect or
inefficient
queries
!12
Sandboxed
non-root
Library+!
User Code
Library+!
User Code
Library+!
User Code
Worker
Worker
Worker
Web!
Server
DB
Security
Sandbox users code"
non-root
Coursera
!1013
How to Scale?
Web!
Server
DB
Worker
Web!
Server
Worker
!1014
Scale
What works for 2 people may not work for 1000 people"
What works for 1000 people may not work when all are logged in
at the same time"
What works for 1000 people on day 1 may not work for 1000
people on day 50
!15
!18
Lessons Learned
You will feel discouraged and moody for the rest of the day"
You will answer the same thing over and over and over again"
People will send you their code and ask you to debug it"
They will feel like you are not doing your job if you tell them no
!20
Data
!21
Data Collected
Google Analytics"
CPU/GPU information"
Thousands of Visitors
!23
Grades
!26
939 actually started the course (FIXME: this figure is not correct)"
Website Down
MP1 Due
MP2 Due
People Submit at
the Last Minute
Grade Timeline
01
Students Work
Program save timeline
!30
!
!
// Student-submitted tiled matrix-multiply kernel (slide excerpt).
// NOTE(review): the statements below are OUT OF ORDER — the slide
// extraction interleaved two columns of the original listing.
// `cellRow`, `cellCol` and `Cvalue` are defined in a part of the
// kernel that is not visible here; do not compile this fragment as-is.
#define BLOCK_SIZE 16
__global__ void MatMulKernel(float *DA, float *DB, float *DC, int Ah, int Aw,
int AwTiles, int Bh, int Bw) {
// Block row and column
// NOTE(review): the two guard `if`s belong after the index setup in
// the original code; `return` lines were split from their conditions.
if (cellRow >= Ah)
int blockRow = blockIdx.y;
return;
int blockCol = blockIdx.x;
if (cellCol >= Bw)
return;
// Thread row and column within Csub
int row = threadIdx.y;
// Final store of the accumulated cell into the output matrix C.
DC[cellRow * Bw + cellCol] = Cvalue;
int col = threadIdx.x;
}
!
!
!
!
//
//
//
//
!
!
// NOTE(review): interior fragment of the tiled MatMulKernel shown
// earlier on this slide — the loop over A's width in BLOCK_SIZE tiles.
// The loop body and closing brace are outside this excerpt.
#pragma unroll
for (int m = 0; m < AwTiles; ++m) {
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
!
!
!
!
!!
//
// CITE: Vasily Volkov, UC Berkeley. Interesting approach to using
// caching a bit differently... had to give it a try...
//
// Device helper for the SGEMM inner loop: accumulate one scaled
// 16-element slice of B into the per-thread accumulator, i.e.
// c[i] += a * b[i] for i = 0..15. The fixed-trip loop below is fully
// unrolled by the compiler, matching the original hand-unrolled
// sequence instruction-for-instruction.
__device__ void saxpy(float a, float *b, float *c) {
#pragma unroll
  for (int i = 0; i < 16; ++i) {
    c[i] += a * b[i];
  }
}
!
!
!
!
!
!
// NOTE(review): fragment of the SGEMM rank-1-update main loop
// (Volkov-style, see citation above). The slide extraction split most
// calls across lines: each `saxpy(a[k], &bs[m][0],` pairs with one of
// the orphaned `c);` lines below it, interleaved with the `a[k] =
// A[k * lda];` prefetches. The third group (bs[8]..bs[11]) survived
// intact and shows the intended per-statement shape:
//   saxpy(a[k], &bs[m][0], c);  a[k] = A[k * lda];
A += 4 * lda;
saxpy(a[0], &bs[0][0],
a[0] = A[0 * lda];
saxpy(a[1], &bs[1][0],
a[1] = A[1 * lda];
saxpy(a[2], &bs[2][0],
a[2] = A[2 * lda];
saxpy(a[3], &bs[3][0],
a[3] = A[3 * lda];
c);
c);
c);
c);
A += 4 * lda;
saxpy(a[0], &bs[4][0],
a[0] = A[0 * lda];
saxpy(a[1], &bs[5][0],
a[1] = A[1 * lda];
saxpy(a[2], &bs[6][0],
a[2] = A[2 * lda];
saxpy(a[3], &bs[7][0],
a[3] = A[3 * lda];
c);
c);
c);
c);
A += 4 * lda;
// Intact group: update with bs rows 8..11, then prefetch next A values.
saxpy(a[0], &bs[8][0], c);
a[0] = A[0 * lda];
saxpy(a[1], &bs[9][0], c);
a[1] = A[1 * lda];
saxpy(a[2], &bs[10][0], c);
a[2] = A[2 * lda];
saxpy(a[3], &bs[11][0], c);
a[3] = A[3 * lda];
A += 4 * lda;
// Final group of the 16-step unroll: bs rows 12..15, no prefetch after.
saxpy(a[0], &bs[12][0],
saxpy(a[1], &bs[13][0],
saxpy(a[2], &bs[14][0],
saxpy(a[3], &bs[15][0],
c);
c);
c);
c);
SGEMM Implementation
!
!
// NOTE(review): excerpt of the optimised SGEMM kernel (Volkov-style).
// Lines are interleaved by the two-column slide extraction: the
// do-while tail (`B += 16; ... __syncthreads(); } while (B < Blast);`)
// and the C write-back loop are mixed into the index declarations.
// `Blast` and the accumulator `c[]` are declared in a part of the
// kernel not shown here; do not compile this fragment as-is.
__global__ void optimisedDLA(const float *A, int lda, const float *B, int ldb,
B += 16;
float *C, int ldc, int k) {
__syncthreads();
const int inx = threadIdx.x;
}
while (B < Blast);
const int iny = threadIdx.y;
// Each block computes a 64x16 tile of C.
const int ibx = blockIdx.x * 64;
for (int i = 0; i < 16; i++, C += ldc)
const int iby = blockIdx.y * 16;
C[0] = c[i];
// Flat thread id within the 16x4 thread block.
const int id = inx + iny * 16;
}
.
A += ibx + id;
!
!
!
!
!
!33
Analysis Opportunities
Source of Data
Lessons Learned
!37
Current Work
!38
Current Work
Questions?
!40