@ Source: b3-computer / kernal / vga-driver.s  (Cliff Biffle)

@ A text-mode VGA driver for the LPC1768.
@ Copyright (C) 2010 Cliff L. Biffle

@
@ Theory of operation (currently incomplete)
@
@ This generates "VGA" video, though this is really a misnomer.  In this context
@ "VGA" means
@ - Analog video with separated R/G/B components,
@ - Separated sync, both horizontal and vertical,
@ - Pixel and sync clocks that match those generated by late-80s IBM PCs.
@
@ This is one of the best-supported video formats on the planet.
@
@ We use a single channel of PWM (PWM1.2) to generate the horizontal sync
@ signal without CPU intervention.  For all video modes currently under
@ consideration, the horizontal sync rate is 31.469kHz.  We approximate this
@ to 100MHz / 4 / 800 = 31.250kHz -- 0.6% error.  The hsync signal is negative
@ polarity (the sync pulse goes low).
@
@ We generate the pixel clock by dividing the 100MHz system clock down by 4,
@ to produce a 25MHz clock.  This is technically wrong by the same amount
@ as hsync (a little over 0.6%), but monitors seem to do okay.  The pixels
@ themselves are clocked out eight at a time by the I2S unit.
@
@ To render text into graphics, we use a small assembly routine that produces
@ pixels one display line at a time.  The line is then fed to the I2S unit
@ automatically using DMA.  The routine is invoked by the DMA Terminal Count
@ interrupt, once per line.
@
@ The rendering routine is built as a state machine.  More information is
@ provided below at its implementation.

.syntax unified
.cpu cortex-m3
.thumb

.include "common.s"
.include "vga-header.s"

@
@ LPC1768 / Cortex-M3 peripheral register addresses
@ Naming convention used below: X_base is an absolute address, X_to_Y is the
@ byte offset of register Y from X_base.
.equ I2STXFIFO, 0x400A8008              @ I2S transmit queue; DMA destination.

.equ DMA_base, 0x50004000               @ DMA controller registers.
.equ DMA_to_Config, 0x030               @ Controller enable; bit 0 is polled.
.equ DMA_to_IntTCClear, 0x008           @ Written to clear the channel IRQ flag.
.equ DMAC0_base, 0x50004100             @ DMA channel 0: receives the 4 LLI words.
.equ DMAC_to_Config, 0x10               @ Per-channel configuration word.

.equ GPIO_base, 0x2009C000              @ GPIO; ports are 0x20 bytes apart.

.equ SC_base, 0x400FC000                @ Power and clocking.
.equ SC_to_PCONP,    0x0C4              @ Peripheral power enables.
.equ SC_to_PCLKSEL0, 0x1A8              @ Peripheral clock select (PWM here).
.equ SC_to_PCLKSEL1, 0x1AC              @ Peripheral clock select (I2S here).

.equ PINSEL_base, 0x4002C000            @ Pin function / pin mode registers.
.equ PINSEL_to_PINSEL0, 0x000           @ Pin functions used for the I2S pins.
.equ PINSEL_to_PINSEL4, 0x010           @ Pin function used for PWM1.2.
.equ PINSEL_to_PINMODE4, 0x050          @ Pull-up/pull-down state for that pin.

.equ I2S_base, 0x400A8000               @ I2S control block.
.equ I2S_to_DAO, 0x000                  @ Output parameters (width, WS period).
.equ I2S_to_DMA1, 0x014                 @ I2S DMA config register.
.equ I2S_to_TXRATE, 0x020               @ Transmit fractional divider.

.equ PWM_base, 0x40018000               @ PWM block that generates hsync.
.equ PWM_to_TCR, 0x004                  @ Timer enable word is written here.
.equ PWM_to_TC,  0x008                  @ Counter; preloaded for hsync fine adjust.
.equ PWM_to_PR,  0x00C                  @ Clock divider (written as divisor - 1).
.equ PWM_to_MCR, 0x014                  @ Written with the same word as PR.
.equ PWM_to_MR0, 0x018                  @ Length of the hsync cycle in pixels.
.equ PWM_to_MR1, 0x01C                  @ Pixel at which hsync goes high.
.equ PWM_to_MR2, 0x020                  @ Pixel at which hsync goes low.
.equ PWM_to_PCR, 0x04C                  @ Output mode/enable for PWM1.2.

@
@ Pin configuration
@ VSYNC is driven by software on GPIO P0.6.
.equ vsync_gpio_port, 0
.equ vsync_gpio_pin, 6

.section .text

@
@ vga_init
@ Initializes the display driver using canned settings.  Note that the DMA IRQ
@ handler must already be set, or horrible things will happen!
@
@ In:      nothing.
@ Out:     nothing.
@ Scratch: r0-r3 (plus whatever setup_dma, i2s_init and hsync_init use).
@ Effect:  fills in the control structure at vga_base, copies the font into
@          font_buf, configures DMA, I2S and PWM, then starts the PWM timer and
@          the I2S DMA request back to back.
.globl vga_init
.thumb_func
vga_init:
          push {LR}

          @ Enable output on the VSYNC gpio (P0.6).  Only our own bit is set;
          @ the rest of the port's direction word is left as we found it.
          movw r1, #:lower16:(GPIO_base + (vsync_gpio_port * 0x20))
          movt r1, #:upper16:(GPIO_base + (vsync_gpio_port * 0x20))
          ldr r0, [r1]
          orr r0, #(1 << vsync_gpio_pin)
          str r0, [r1]

          @ Initialize the contents of the control structure at vga_base
          movw r0, #:lower16:vga_base
          movt r0, #:upper16:vga_base
          mov r1, #0
          str r1, [r0, #vga_to_line]    @ vga_line = 0
          str r1, [r0, #vga_to_state]   @ vga_state = 0

          add r1, r0, #(data_buf1 - vga_base)
          str r1, [r0, #vga_to_buf]     @ vga_buf = data_buf1

          adr r1, vgas_before_vsync+1   @ +1 marks the address as Thumb.
          str r1, [r0, #vga_to_next]    @ vga_next = vgas_before_vsync

          movw r1, #:lower16:font_buf
          movt r1, #:upper16:font_buf
          str r1, [r0, #vga_to_font]    @ vga_font = font_buf

          movw r1, #:lower16:text_buf
          movt r1, #:upper16:text_buf
          str r1, [r0, #vga_to_text]    @ vga_text = text_buf
          adds r1, #(80 * 24)           @ Skip 24 rows of 80 characters.
          str r1, [r0, #vga_to_cursor_pos] @ vga_cursor_pos = last line

          @ Copy the font (128 * 16 bytes, one word per iteration).
          movw r0, #:lower16:font_table
          movt r0, #:upper16:font_table

          movw r1, #:lower16:font_buf
          movt r1, #:upper16:font_buf

          mov r2, #(128 * 16)           @ Bytes remaining.
1:        ldr r3, [r0], #4
          str r3, [r1], #4
          subs r2, #4
          bne 1b                        @ Loop until the count hits zero.

          @ Initialize the hardware
          bl setup_dma
          bl i2s_init
          bl hsync_init

          @ Switch on DMA and PWM as close together as possible.  Everything
          @ is staged in registers first so that the two stores are adjacent.

          mov r2, #(1 | (1 << 3))       @ Timer enable word in r2

          @ Address of TCR in r3.  Built from the constants rather than from
          @ a register value left behind by an earlier call.
          movw r3, #:lower16:(PWM_base + PWM_to_TCR)
          movt r3, #:upper16:(PWM_base + PWM_to_TCR)

          @ I2S DMA config register address in r1
          movw r1, #:lower16:(I2S_base + I2S_to_DMA1)
          movt r1, #:upper16:(I2S_base + I2S_to_DMA1)
          @ I2S DMA config word in r0
          movs r0, #(1 << 1)
          orr r0, #(2 << 16)

          @ Okay, now, real quick:
          str r2, [r3]                  @ Start the PWM timer (hsync).
          str r0, [r1]                  @ Start the I2S DMA request (pixels).

          pop {PC}

@ i2s_init
@ Prepares the I2S transmitter that shifts the pixels out.  File-local helper,
@ reached with bl from vga_init.
@ In:      nothing.
@ Scratch: r1 (ends as I2S_base), r2.
i2s_init:
          @ Power: switch the I2S block on (PCONP bit 27).
          movw r1, #:lower16:SC_base
          movt r1, #:upper16:SC_base
          ldr r2, [r1, #SC_to_PCONP]
          orr r2, r2, #(1 << 27)
          str r2, [r1, #SC_to_PCONP]

          @ Clock: set bit 22 of PCLKSEL1 to pick the I2S clock source.
          ldr r2, [r1, #SC_to_PCLKSEL1]
          orr r2, r2, #(1 << 22)
          str r2, [r1, #SC_to_PCLKSEL1]

          @ Pins: set bits 16 and 18 of PINSEL0 to hand the pins to the I2S.
          @ The pin block address is loaded in full rather than patched in.
          movw r1, #:lower16:PINSEL_base
          movt r1, #:upper16:PINSEL_base
          ldr r2, [r1, #PINSEL_to_PINSEL0]
          orr r2, r2, #((1 << 16) | (1 << 18))
          str r2, [r1, #PINSEL_to_PINSEL0]

          @ From here on r1 addresses the I2S control block.
          movw r1, #:lower16:I2S_base
          movt r1, #:upper16:I2S_base

          @ Fractional divider word.  The value is tuned for a 100MHz clock.
          movw r2, #0x0104
          str r2, [r1, #I2S_to_TXRATE]

          @ Output format word:
          @  bits 1:0 = 0       8-bit data
          @  bit  2   = 0       stereo
          @  bits 6.. = 8 - 1   8 clocks per WS half-period
          movw r2, #((8 - 1) << 6)
          str r2, [r1, #I2S_to_DAO]

          bx LR
          

@ setup_dma
@ Powers up the DMA controller, loads channel 0 from the first linked-list
@ item (dma_lli0), configures the channel and unmasks the DMA interrupt.
@ File-local helper, reached with bl from vga_init.
@ In:      nothing.
@ Scratch: r1, r2.  r3-r6 are used briefly but saved and restored.
setup_dma:
          @ Power: switch the DMA controller on (PCONP bit 29).
          movw r1, #:lower16:(SC_base + SC_to_PCONP)
          movt r1, #:upper16:(SC_base + SC_to_PCONP)
          ldr r2, [r1]
          orr r2, r2, #(1 << 29)
          str r2, [r1]

          @ r1 = DMA controller registers.
          movw r1, #:lower16:DMA_base
          movt r1, #:upper16:DMA_base

          @ Request the enable, then wait for bit 0 to read back as set.
          movs r2, #1
          str r2, [r1, #DMA_to_Config]
1:        ldr r2, [r1, #DMA_to_Config]
          tst r2, #1
          beq 1b

          @ Seed channel 0 with the four words of dma_lli0 (source, dest,
          @ next item, control) in one block move.
          movw r2, #:lower16:dma_lli0
          movt r2, #:upper16:dma_lli0
          adds r1, r1, #(DMAC0_base - DMA_base)   @ r1 = channel 0 registers.
          push {r3, r4, r5, r6}
          ldmia r2, {r3, r4, r5, r6}
          stmia r1, {r3, r4, r5, r6}
          pop {r3, r4, r5, r6}

          @ Channel 0 configuration word.
          @ NOTE(review): field meanings (enable, destination request 5,
          @ transfer type 1, two interrupt-enable bits) should be confirmed
          @ against the DMA channel-config register description.
          movw r2, #(1 | (5 << 6) | (1 << 11) | (1 << 14) | (1 << 15))
          str r2, [r1, #DMAC_to_Config]

          @ Unmask the DMA interrupt in the NVIC.
          movw r1, #:lower16:(NVIC_base + NVIC_to_ISER0)
          movt r1, #:upper16:(NVIC_base + NVIC_to_ISER0)
          mov r2, #(1 << NVIC_DMA_IRQ)
          str r2, [r1]

          bx LR

@ hsync_init
@ Sets up PWM1.2 to produce the horizontal sync pulse.  The timer is left
@ stopped; the caller starts it by writing the timer control register.
@ In:      nothing.
@ Scratch: r1, r2.
@ Out:     r1 = PWM_base on return.  Code elsewhere in this file may rely on
@          that, so keep it when editing.
.globl hsync_init
.thumb_func
hsync_init:
          @ ---- Power and clock --------------------------------------------
          movw r1, #:lower16:SC_base
          movt r1, #:upper16:SC_base

          ldr r2, [r1, #SC_to_PCONP]
          orr r2, r2, #(1 << 6)         @ PWM power on.
          str r2, [r1, #SC_to_PCONP]

          ldr r2, [r1, #SC_to_PCLKSEL0]
          orr r2, r2, #(1 << 12)        @ PWM runs straight from CCLK.
          str r2, [r1, #SC_to_PCLKSEL0]

          @ ---- Pin ---------------------------------------------------------
          @ The pin block address is loaded in full rather than patched in.
          movw r1, #:lower16:PINSEL_base
          movt r1, #:upper16:PINSEL_base

          ldr r2, [r1, #PINSEL_to_PINSEL4]
          bics r2, r2, #(3 << 2)        @ Drop the current P2.1 function,
          orrs r2, r2, #(1 << 2)        @ then select PWM1.2.
          str r2, [r1, #PINSEL_to_PINSEL4]

          ldr r2, [r1, #PINSEL_to_PINMODE4]
          bics r2, r2, #(3 << 2)        @ Drop the current pull setting,
          orrs r2, r2, #(2 << 2)        @ then choose no pullup/pulldown.
          str r2, [r1, #PINSEL_to_PINMODE4]

          @ ---- Timer -------------------------------------------------------
          movw r1, #:lower16:PWM_base
          movt r1, #:upper16:PWM_base

          movs r2, #(4 - 1)             @ Divisor 4: 100MHz / 4 = 25MHz.
          str r2, [r1, #PWM_to_PR]
          @ The very same word (3) is also stored to MCR.
          @ NOTE(review): confirm that both low MCR bits are meant to be set.
          str r2, [r1, #PWM_to_MCR]

          @ One hsync cycle, slightly out of phase with the pixel cycle.
          movs r2, #800                 @ Cycle length: 800 pixels.
          str r2, [r1, #PWM_to_MR0]
          movs r2, #97                  @ Output rises at pixel 97.
          str r2, [r1, #PWM_to_MR1]
          movs r2, #1                   @ Output falls at pixel 1.
          str r2, [r1, #PWM_to_MR2]

          @ Fine adjust: starting the counter at a non-zero value moves hsync
          @ left, which moves the whole picture right by that many pixels.
          movw r2, #4
          str r2, [r1, #PWM_to_TC]

          movw r2, #((1 << 2) | (1 << 10))        @ Double-edged, enabled
          str r2, [r1, #PWM_to_PCR]

          bx LR

@ DMAIRQHandler
@ Runs once per DMA terminal-count interrupt, i.e. once per display line.
@ Clears the channel 0 flag, bumps the line counter and jumps to whichever
@ state function is currently stored in the control structure.
@
@ State functions are entered with:
@   r0 = vga_base
@   r1 = vga_line (already incremented)
@   LR = unchanged, so their "bx LR" ends the interrupt.
.globl DMAIRQHandler
.thumb_func
DMAIRQHandler:
          movw r0, #:lower16:vga_base   @ r0 = vga_base for the state function.
          movt r0, #:upper16:vga_base

          @ Clear the interrupt flag; r1/r2 are free until loaded below.
          movw r2, #:lower16:(DMA_base + DMA_to_IntTCClear)
          movt r2, #:upper16:(DMA_base + DMA_to_IntTCClear)
          movs r1, #1                   @ Bit 0 = channel 0.
          str r1, [r2]

          @ vga_line += 1, keeping the new value in r1.
          ldr r1, [r0, #vga_to_line]
          adds r1, r1, #1
          str r1, [r0, #vga_to_line]

          @ Tail-call the current state function.
          ldr r2, [r0, #vga_to_next]
          bx r2

@
@ Driver state machine
@
@ At each step, the DMA IRQ handler (above) calls the next function in the
@ state machine.  These functions are described below.

@ vgas_before_vsync
@ Hangs out for a couple of lines, waiting for the start of vsync.
@ At the right moment, pulls the vsync line low.
@ In:      r0 = vga_base, r1 = line number (from DMAIRQHandler).
@ Scratch: r1, r2.
.thumb_func
vgas_before_vsync:
          cmp r1, #2                    @ Are we at the start of line 2?
          beq 1f                        @ If so, skip ahead.
          bx LR                         @ Otherwise, return.

1:        @ Vsync Low
          @ Address of the VSYNC port's clear register (port base + 0x1C)
          @ into r2, derived from the same constants vga_init uses.
          movw r2, #:lower16:(GPIO_base + (vsync_gpio_port * 0x20) + 0x1C)
          movt r2, #:upper16:(GPIO_base + (vsync_gpio_port * 0x20) + 0x1C)
          movs r1, #(1 << vsync_gpio_pin) @ Writing the bit drives the pin low.
          str r1, [r2]

          @ Reinitialize some driver state: rewind the text position to the
          @ start of the text buffer for the coming frame.
          ldr r1, [r0, #vga_to_text]
          str r1, [r0, #vga_to_text_pos]

          @ -> vgas_during_vsync
          adr r2, vgas_during_vsync+1   @ Generate thumb address.
          str r2, [r0, #vga_to_next]    @ Store it in vector.
          bx LR

@ vgas_during_vsync
@ Waits out the vsync period (2 lines).
@ At the right moment, pulls the vsync line high.
@ In:      r0 = vga_base, r1 = line number (from DMAIRQHandler).
@ Scratch: r1, r2.
.thumb_func
vgas_during_vsync:
          cmp r1, #4                    @ Are we at the start of line 4?
          beq 1f                        @ If so, skip ahead.
          bx LR                         @ Otherwise, return.

1:        @ Vsync High
          @ Address of the VSYNC port's set register (port base + 0x18)
          @ into r2, derived from the same constants vga_init uses.
          movw r2, #:lower16:(GPIO_base + (vsync_gpio_port * 0x20) + 0x18)
          movt r2, #:upper16:(GPIO_base + (vsync_gpio_port * 0x20) + 0x18)
          movs r1, #(1 << vsync_gpio_pin) @ Writing the bit drives the pin high.
          str r1, [r2]

          @ -> vgas_after_vsync
          adr r2, vgas_after_vsync+1    @ Generate thumb address.
          str r2, [r0, #vga_to_next]    @ Store it in vector.
          bx LR

@ vgas_after_vsync
@ Idles through the blank lines between vsync and the picture.  On line 79 it
@ installs vgas_active_video, so that function first runs on line 80.
@ In:      r0 = vga_base, r1 = line number (from DMAIRQHandler).
@ Scratch: r2 (only on line 79).
.thumb_func
vgas_after_vsync:
          cmp r1, #79                   @ Last blank line yet?
          bne 9f                        @ No: nothing to do on this line.

          @ -> vgas_active_video
          adr r2, vgas_active_video+1   @ +1 marks the address as Thumb.
          str r2, [r0, #vga_to_next]    @ Becomes the next state function.

9:        bx LR

@ vgas_active_video
@ The real meat of the driver.  While one line buffer is being drawn,
@ copies pixels into the other line buffer.
@
@ In:      r0 = vga_base, r1 = line number (from DMAIRQHandler).
@ Scratch: r1, r2, r3.  r4-r7 are used by the render path but are saved and
@          restored around it.
@
@ Every call first exchanges vga_buf between data_buf0 and data_buf1.  On
@ line 480 it then only hands over to vgas_after_video; on any other line it
@ renders one pixel row of 80 characters into the buffer that was current on
@ entry, starting 16 bytes in.
.thumb_func
vgas_active_video:
          @ Load and exchange buffers.
          ldr r2, [r0, #vga_to_buf]     @ r2 = buffer to render into this time.

          adds r3, r0, #(data_buf0 - vga_base) @ (r3 = data_buf0)
          cmp r2, r3                    @ Are we currently on data_buf0?
          ite eq                        @ If so,
          addeq r3, r2, #lbuf_size      @ step one buffer up to data_buf1;
          subne r3, r2, #lbuf_size      @ otherwise, step back to data_buf0.
          str r3, [r0, #vga_to_buf]     @ The other buffer is used next time.

          @ State logic
          mov r3, #480                  @ Are we at the start of line 480?
          cmp r1, r3
          beq 1f                        @ If so, skip ahead (nothing rendered).

          push {r4, r5, r6, r7}         @ Free up some registers.
          and r1, #0xF                  @ Convert line number into glyph line.
          ldr r4, [r0, #vga_to_text_pos]@ r4 = first character of this text row.

          cmp r1, #0xF                  @ Are we on the last line of the glyph?
          itt eq                        @ If so,
          addeq r5, r4, #80             @ compute the start of the next row,
          streq r5, [r0, #vga_to_text_pos]        @ and store it back.

          @ Cursor countdown.  With bit 0 of the state word clear, r7 becomes
          @ (cursor position - row start + 1), which the loop below counts
          @ down to zero exactly at the cursor's column.  With bit 0 set, r7
          @ keeps the state word itself and the same arithmetic is applied to
          @ it; presumably that never reaches zero inside the row -- confirm.
          ldr r7, [r0, #vga_to_state]   @ Load state.
          tst r7, #1                    @ Is the cursor hidden?
          it eq                         @ If not,
          ldreq r7, [r0, #vga_to_cursor_pos]  @ Load cursor position
          subs r7, r4                   @ Convert to character-within-line,
          adds r7, #1                   @ but add 1 so we can just subtract.

          @ Glyphs are stored by slice -- i.e. all slice 0s, followed by
          @ all slice 1s, etc.  Thus, we can compute slice_base by adding
          @ (line & 0xF) * 128 to the font_base.
          ldr r3, [r0, #vga_to_font]    @ Load the current font pointer.
          add r1, r3, r1, LSL #7        @ Get the address of the glyph slice.

          adds r2, #16                  @ Move past left blank region in buf.
          movs r3, #80                  @ Set up counter.

          @ Render loop.  On each iteration we render slices of two characters,
          @ to compensate for odd byte ordering during I2S transmission.
          @ r0  vga_base
          @ r1  glyph base
          @ r2  line buffer position
          @ r3  character counter
          @ r4  text position
          @ r5  temporary
          @ r6  temporary
          @ r7  cursor position
          @ NOTE(review): the character code is used directly as a byte offset
          @ from the slice base, and slices are 128 bytes apart, so a code of
          @ 128 or more would read from a later slice -- confirm text_buf only
          @ ever holds codes below 128.
3:        ldrb r5, [r4, #1]             @ Load odd character.
          ldrb r6, [r4], #2             @ Load even character, advance.

          ldrb r5, [r1, r5]             @ Load odd glyph slice.
          ldrb r6, [r1, r6]             @ Load even glyph slice.
          subs r7, #1                   @ Decrement the cursor counter.
          it eq                         @ If it reached zero,
          mvneq r6, r6                  @ invert the even slice.

          subs r7, #1                   @ Decrement the cursor counter.
          it eq                         @ If it reached zero,
          mvneq r5, r5                  @ invert the odd slice.

          bfi r5, r6, #8, #8            @ Low byte = odd slice, next = even.
          strh r5, [r2], #2             @ Write both out, advance two bytes.

          subs r3, #2                   @ Two characters done.
          beq 2f                        @ Escape when counter runs out.

          b 3b                          @ Continue otherwise.

2:        pop {r4, r5, r6, r7}          @ Un-spill.
          bx LR                         @ Return

1:        @ -> vgas_after_video
          adr r2, vgas_after_video+1    @ Generate thumb address.
          str r2, [r0, #vga_to_next]    @ Store it in vector.
          bx LR

@ vgas_after_video
@ Waits for a couple lines, clearing line buffers as they become unused.
@ On each call before line 484 it zeroes the current 100-byte line buffer and
@ switches vga_buf to the other one; on line 484 it hands over to vgas_tail.
@ In:      r0 = vga_base, r1 = line number (from DMAIRQHandler).
@ Scratch: r1, r2, r3.
.thumb_func
vgas_after_video:
          movw r2, #484                 @ Are we at the start of line 484?
          cmp r1, r2
          beq 2f                        @ If so, skip ahead.

          @ Clear the current buffer, one word at a time (25 words).
          ldr r2, [r0, #vga_to_buf]

          mov r3, #100                  @ Bytes remaining.
          mov r1, #0
1:        str r1, [r2], #4
          subs r3, #4
          bne 1b

          @ Exchange buffers.  r2 now points 100 bytes past the buffer just
          @ cleared.  If that is data_buf1, we cleared the buffer directly
          @ below it and r2 already names the other one.  Otherwise we cleared
          @ data_buf1 itself, so back up two buffer lengths.
          @ (Assumes the two 100-byte buffers are adjacent, lower one first.)
          add r3, r0, #(data_buf1 - vga_base)
          cmp r3, r2
          it ne
          subne r2, #200
          str r2, [r0, #vga_to_buf]
          bx LR

2:        @ -> vgas_tail
          adr r2, vgas_tail+1           @ Generate thumb address.
          str r2, [r0, #vga_to_next]    @ Store it in vector.
          bx LR


@ vgas_tail
@ Sits out the blank lines at the bottom of the frame.  Each of those lines it
@ zeroes the current line buffer again; on line 524 it rewinds the line
@ counter and restarts the state machine at vgas_before_vsync.
@ In:      r0 = vga_base, r1 = line number (from DMAIRQHandler).
@ Scratch: r1, r2, r3.
.thumb_func
vgas_tail:
          movw r2, #524                 @ End of frame reached?
          cmp r1, r2
          bne 2f                        @ Not yet: go clear the buffer.

          @ Line 524: start counting lines from zero again.
          mov r1, #0
          str r1, [r0, #vga_to_line]

          @ -> vgas_before_vsync
          adr r2, vgas_before_vsync+1   @ +1 marks the address as Thumb.
          str r2, [r0, #vga_to_next]    @ Becomes the next state function.
          bx LR

2:        @ Zero the 100-byte buffer at vga_buf, a word per iteration.
          ldr r2, [r0, #vga_to_buf]

          mov r3, #100                  @ Bytes remaining.
          movs r1, #0
3:        str r1, [r2], #4
          subs r3, r3, #4
          bne 3b

          bx LR

@
@ DMA Buffer Cycle
@ To ensure a continuous feed of data to the I2S unit, we use the DMA unit's
@ gather function.  This consumes bytes from an arbitrary number of buffers,
@ connected in a linked list; each node is called a Linked List Item, or LLI.
@ Here, we construct two LLIs and link them into a circular list.  After setting
@ this once, we let the DMA unit run unattended.

.section .rodata
.p2align 2

@ Control word shared by both list items.  Evaluates to 0x84489019:
@   25          transfer size: 100 bytes per line, counted in words
@   (1 << 12)   source burst field
@   (1 << 15)   destination burst field
@   (2 << 18)   source is word-sized
@   (2 << 21)   dest is word-sized
@   (1 << 26)   source address increments (dest increment, bit 27, stays 0)
@   (1 << 31)   interrupt on terminal count
@ NOTE(review): the burst fields were originally described as "1-word"
@ bursts; confirm what a field value of 1 selects on this DMA controller.
.equ dma_lli_control, (25 | (1 << 12) | (1 << 15) | (2 << 18) | (2 << 21) | (1 << 26) | (1 << 31))

@ List item 0: send line buffer 0, then continue with item 1.
dma_lli0:
          .word data_buf0               @ Source: line buffer 0
          .word I2STXFIFO               @ Dest: I2S transmit queue
          .word dma_lli1                @ Next: dma_lli1
          .word dma_lli_control         @ Control word (see above)

@ List item 1: send line buffer 1, then loop back to item 0.
dma_lli1:
          .word data_buf1               @ Source: line buffer 1
          .word I2STXFIFO               @ Dest: I2S transmit queue
          .word dma_lli0                @ Next: dma_lli0
          .word dma_lli_control         @ Control word (see above)