#lang racket
(require plot pict)

;; Sample a label from the discrete distribution given by `probs`. An
;; escape continuation exits the fold as soon as the accumulated mass
;; covers the sample point; the trailing `last` guards against
;; floating-point round-off leaving a tiny uncovered remainder.
(define (random-distribution labels probs)
  (let/ec return
    (for/fold ([point (random)]) ([label labels] [prob probs])
      (if (<= point prob) (return label) (- point prob)))
    (last labels)))

;; Shift a score vector so its maximum is 0. Scores live in log-space,
;; so this keeps `exp` from overflowing without changing the induced
;; softmax distribution.
(define (threshold-vector! v)
  (define threshold (apply max (vector->list v)))
  (for ([x (in-vector v)] [i (in-naturals)])
    (vector-set! v i (- x threshold))))

;; Sum of exponentials of a log-space vector: the softmax normalizer.
(define (vector-sum-lg v)
  (for/fold ([sum 0]) ([x (in-vector v)])
    (+ sum (exp x))))

;;; AGENTS AND BANDITS

;; We implement both regular *and* blind no-regret dynamics.

;; Agents use regular no-regret dynamics: they choose an action,
;; observe what would have happened had they chosen differently, and
;; adapt over time.

(define *rate* 0.1)

(struct agent (actions utility scores))

(define/contract (make-agent actions utility)
  (-> (listof symbol?) (-> (listof symbol?) number?) agent?)
  (agent actions utility (for/vector ([action actions]) 0.0)))

;; Softmax (multiplicative-weights) distribution over the agent's
;; actions, computed from its log-space scores.
(define (agent-probabilities ag)
  (match-define (agent _ _ scores) ag)
  (define sum (vector-sum-lg scores))
  (for/vector ([score (in-vector scores)]) (/ (exp score) sum)))

(define (agent-choose ag)
  (match-define (agent actions _ _) ag)
  (random-distribution actions (agent-probabilities ag)))

;; Full-information update: score every action by the utility the agent
;; *would have* received had it played that action in slot `i` of the
;; joint action profile `done`.
(define (agent-update! ag rate i done)
  (match-define (agent actions utility scores) ag)
  (define utilities
    (for/vector ([action actions]) (utility (list-set done i action))))
  (for ([score (in-vector scores)] [delta (in-vector utilities)]
        [j (in-naturals)])
    (vector-set! scores j (+ score (* rate delta))))
  (threshold-vector! scores))

;; Bandits use blind no-regret dynamics: they choose an action, observe
;; only the reward for that action, and adapt over time. The algorithm
;; is the Exp3.P forecaster from "Prediction, Learning, and Games" by
;; Nicolo Cesa-Bianchi and Gabor Lugosi.

(struct bandit (beta eta gamma labels weights probs choice) #:mutable)

;; `n` is the expected number of rounds, used to tune the parameters.
(define (make-bandit n labels)
  (define N (length labels))
  (define delta (max (/ N (exp (/ n (* 2 N)))) 0.05))
  (define beta (sqrt (* (/ 1. (* n N)) (log (/ N delta)))))
  (define gamma (/ (* 4 N beta) (+ 3 beta)))
  (define eta (/ gamma (* 2 N)))
  (eprintf "δ = ~a, β = ~a, γ = ~a, η = ~a\n" delta beta gamma eta)
  (bandit beta eta gamma labels
          (build-vector N (const 1))
          (build-vector N (const (/ 1. N)))
          #f))

(define (bandit-choose bandit)
  (define choice
    (random-distribution (bandit-labels bandit) (bandit-probs bandit)))
  (set-bandit-choice! bandit choice)
  choice)

;; Exp3.P update. The (reward + β)/p estimator is an optimistic
;; importance-weighted gain; weights live in log-space, so we add
;; η·gain rather than multiplying by exp(η·gain), then rescale with
;; threshold-vector! to avoid overflow. Probabilities mix the softmax
;; of the weights with γ of uniform exploration.
(define (bandit-update! b reward)
  (match-define (bandit beta eta gamma labels weights probs choice) b)
  (for ([label labels] [prob (in-vector probs)]
        [weight (in-vector weights)] [i (in-naturals)])
    (define gain
      (if (equal? label choice)
          (/ (+ reward beta) prob)
          (/ beta prob)))
    (vector-set! weights i (+ weight (* eta gain))))
  (threshold-vector! weights)
  (define sum (vector-sum-lg weights))
  (for ([w (in-vector weights)] [i (in-naturals)])
    (vector-set! probs i
                 (+ (* (- 1 gamma) (/ (exp w) sum))
                    (/ gamma (length labels))))))
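;; A quick smoke test of the bandit machinery. This block is a sketch,
;; not part of the original file: the two-armed Bernoulli reward scheme
;; below is a made-up example. After enough rounds the bandit should
;; put most of its probability mass on the better arm.
(module+ test
  (define b (make-bandit 2000 '(left right)))
  (for ([_ (in-range 2000)])
    (define arm (bandit-choose b))
    ;; 'right pays off with probability 0.9, 'left with probability 0.1.
    (define p (if (equal? arm 'right) 0.9 0.1))
    (bandit-update! b (if (< (random) p) 1.0 0.0)))
  (eprintf "smoke-test probabilities: ~a\n" (bandit-probs b)))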
;;; Games store a set of actions and per-player utilities

(struct game (actions utilities))

;; `utility` maps a joint action profile to a list of payoffs, one per
;; player; make-game splits it into one scalar utility per player.
(define (make-game actions utility)
  (game actions
        (for/list ([action actions] [i (in-naturals)])
          (compose (curryr list-ref i) (curry apply utility)))))

(define pd-actions '((C D) (C D)))

;; The classic prisoner's dilemma: defection dominates, but mutual
;; cooperation pays more than mutual defection.
(define/match (pd-utility a b)
  [('C 'C) '(5 5)]
  [('D 'C) '(6 0)]
  [('C 'D) '(0 6)]
  [('D 'D) '(1 1)])

(define pd-game (make-game pd-actions pd-utility))

;; A variant with a much weaker temptation to defect.
(define/match (pd-utility2 a b)
  [('C 'C) '(5 5)]
  [('D 'C) '(2 0)]
  [('C 'D) '(0 2)]
  [('D 'D) '(1 1)])

(define pd-game2 (make-game pd-actions pd-utility2))

;;; GAMES, NORMAL AND MORAL

;; Normal games are run by instantiating bandits to choose actions
;; directly. The empirical distribution of no-regret play converges to
;; the set of coarse correlated equilibria.

(define (run-game gm #:steps [steps 250])
  (match-define (game actions utilities) gm)
  (define bandits (map (curry make-bandit steps) actions))
  (for/list ([step (in-range steps)])
    (define choices (map bandit-choose bandits))
    (define rewards (map (λ (f) (f choices)) utilities))
    (for-each bandit-update! bandits rewards)
    ;; Wrap each choice in a singleton list to match the
    ;; (choice index) records produced by run-moral-game.
    (map list choices)))

;; A moral game has multiple plausible utilities, including a
;; distinguished "original" utility. It instantiates several agents per
;; player, one per plausible utility, and also a bandit per player to
;; choose among those agents.

;; One alternative utility profile for the prisoner's dilemma: both
;; players judged by pd-utility2's payoffs.
(define pd-util+
  (list (list (curry apply (compose first pd-utility2))
              (curry apply (compose second pd-utility2)))))

(define (run-moral-game gm utils+ #:steps [steps 250])
  (match-define (game actions utilities) gm)
  ;; One agent per plausible utility function, per player: transpose
  ;; the utility profiles so each player gets its own column, then pair
  ;; every utility in that column with the player's own actions.
  (define agents
    (for/list ([acts actions]
               [utils (apply map list (cons utilities utils+))])
      (map (curry make-agent acts) utils)))
  (define bandits (map (curry make-bandit steps) agents))
  (for/list ([step (in-range steps)])
    (define subagents (map bandit-choose bandits))
    (define idxs
      (for/list ([bandit bandits] [subagent subagents])
        (index-of (bandit-labels bandit) subagent)))
    (define choices (map agent-choose subagents))
    (define rewards (map (λ (f) (f choices)) utilities))
    (for-each bandit-update! bandits rewards)
    (for ([bandit bandits] [subagent subagents] [i (in-naturals)])
      ;; Importance-weight the agent's learning rate by the probability
      ;; that its bandit actually selected it this round.
      (define prob
        (vector-ref (bandit-probs bandit)
                    (index-of (bandit-labels bandit) subagent)))
      (agent-update! subagent (/ *rate* prob) i choices))
    (map list choices idxs)))

;; This simple function plots the behavior of the agents.
;; NOTE: the original source is truncated mid-definition here. The sort
;; comparator (which began "string") is reconstructed as string<? over
;; printed keys, and everything after `actionss` is lost; `gm` and
;; `tmpl` were presumably consumed by the missing plotting code.
(define (plot-results gm dones tmpl)
  (define actionss
    (for/list ([_ (first dones)] [i (in-naturals)])
      (sort (remove-duplicates (append-map (curryr list-ref i) dones))
            string<? #:key ~a)))
  actionss)
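;; Example usage (a sketch, not from the original file): run the plain
;; prisoner's dilemma and tally the joint action profiles. Since
;; defection strictly dominates, no-regret play should concentrate the
;; tally on (D D).
(module+ main
  (define history (run-game pd-game #:steps 250))
  (define tally
    (for/fold ([h (hash)]) ([done history])
      (hash-update h (map first done) add1 0)))
  (for ([(profile count) (in-hash tally)])
    (printf "~a: ~a\n" profile count))
  ;; The moral variant: each player also entertains the pd-utility2
  ;; payoffs, with a bandit arbitrating between its two "selves".
  (void (run-moral-game pd-game pd-util+ #:steps 250)))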