Q-обучение в игре не работает должным образом

Я попытался реализовать Q-обучение в простой игре, которую я написал. Игра основана на том, что игрок должен "прыгать", чтобы избежать встречных ящиков.

Я разработал систему с двумя действиями: jump и do_nothing, а состояния — это расстояния до следующего блока (поделённые и округлённые вниз, чтобы состояний не было слишком много).

Моя проблема в том, что моя реализация алгоритма не учитывает «будущую награду», и в итоге агент прыгает в неподходящие моменты.

Вот моя реализация алгоритма Q-обучения,

/**
 * Returns the action-value entry stored for `state`, creating it on first use.
 *
 * A state that is not yet an own key of `this.Q` gets a new entry in which
 * every action from `this.actions` starts at 0.
 *
 * @param {number|string} state - Key identifying the state.
 * @returns {Object} The mutable action-to-value map kept at `this.Q[state]`.
 */
JumpGameAIClass.prototype.getQ = function getQ(state) {
    var table = this.Q;

    if (table.hasOwnProperty(state)) {
        return table[state];
    }

    var entry = table[state] = {};

    for (var i = 0, count = this.actions.length; i < count; i++) {
        entry[this.actions[i]] = 0;
    }

    return entry;
};

/**
 * Computes the discretised distance from the player to the nearest block
 * that is at or ahead of the player's x position.
 *
 * The raw distance (`block.x - this.playerX`) is multiplied by
 * `this.resolution` and floored. When no block qualifies the result is 0.
 *
 * @returns {number} Non-negative integer distance bucket.
 */
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var nearest = -1; // -1 means "no block found yet"

    for (var i = 0; i < this.blocks.length; i++) {
        var gap = this.blocks[i].x - this.playerX;

        // Blocks behind the player do not count.
        if (!(gap >= 0)) {
            continue;
        }

        if (nearest === -1 || gap < nearest) {
            nearest = gap;
        }
    }

    var bucket = Math.floor(nearest * this.resolution);

    // Clamps the "nothing found" case (negative bucket) to 0.
    return Math.max(0, bucket);
};

/**
 * Picks the action with the larger stored value for the given state.
 *
 * On a tie the second action (`this.actions[1]`) is returned when
 * `this.canJump()` is false; otherwise an action is drawn uniformly at random.
 *
 * @param {number} distance - State key, as produced by getBlockDistance().
 * @returns {string} One of the entries of `this.actions`.
 */
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpAction = this.actions[0];
    var idleAction = this.actions[1];
    var values = this.getQ(distance);
    var jumpValue = values[jumpAction];
    var idleValue = values[idleAction];

    if (jumpValue > idleValue) {
        return jumpAction;
    }

    if (idleValue > jumpValue) {
        return idleAction;
    }

    // Tie: jumping is only a candidate when it is currently possible.
    if (!this.canJump()) {
        return idleAction;
    }

    var pick = Math.floor(Math.random() * this.actions.length);

    return this.actions[pick];
};

/**
 * Chooses the next action with an epsilon-greedy policy.
 *
 * With probability `this.epsilon` a uniformly random action is returned;
 * otherwise the highest-valued action for the current block distance is used.
 *
 * @returns {string} One of the entries of `this.actions`.
 */
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // Jumping is impossible while in mid-air, so doing nothing is the only option.
    if (!this.canJump()) {
        return this.actions[1];
    }

    var explore = Math.random() < this.epsilon;

    if (!explore) {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }

    var pick = Math.floor(Math.random() * this.actions.length);

    return this.actions[pick];
};

/**
 * Runs one learning step: rewards the previous (state, action) pair, then
 * chooses and performs the next action.
 *
 * Reward selection: `scoreReward` when the score changed since the last call,
 * otherwise `deathReward` when the player is not alive, otherwise `liveReward`.
 * The stored value for (`lastDistance`, `lastAction`) is then moved towards
 * `reward + gamma * Q(currentState, bestAction)` by the factor `alpha`.
 */
JumpGameAIClass.prototype.think = function think() {
    var reward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    } else {
        reward = this.liveReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance();
    var bestNextValue = this.getQ(distance)[this.getActionWithHighestQ(distance)];
    var previousEntry = this.getQ(this.lastDistance);
    var previousValue = previousEntry[this.lastAction];

    // Q-learning update for the action taken on the previous frame.
    previousEntry[this.lastAction] = previousValue + this.alpha * (reward + (this.gamma * bestNextValue) - previousValue);

    // Remember what is being done now so the next call can reward it.
    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    if (this.lastAction === this.actions[0]) {
        this.jump();
    }
};

И вот некоторые из его свойств:

epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0

Мне приходится использовать lastAction/lastDistance для вычисления Q, поскольку текущие данные использовать нельзя (награда в текущем кадре относится к действию, выполненному в предыдущем кадре).

Метод think вызывается один раз в каждом кадре после выполнения всех операций рендеринга и игры (физика, элементы управления, смерть и т.д.).

/**
 * Q-learning player built on top of Game.JumpGame.
 *
 * Calls the base constructor with `canvas`, then defines the learning
 * parameters and bookkeeping fields as non-enumerable own properties.
 * Hyper-parameters are read-only; `Q`, `lastAction`, `lastDistance` and
 * `lastScore` are writable because they change while the agent plays.
 *
 * @param {*} canvas - Forwarded unchanged to Game.JumpGame.
 */
var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    // Descriptor for a value that never changes after construction.
    function fixed(value) {
        return { value: value };
    }

    // Descriptor for a value the agent reassigns while it runs.
    function mutable(value) {
        return { value: value, writable: true };
    }

    Object.defineProperties(this, {
        epsilon: fixed(0.05),          // probability of a random action
        alpha: fixed(1),               // learning rate
        gamma: fixed(1),               // discount factor
        resolution: fixed(0.1),        // distance-to-state scale
        actions: fixed([ 'jump', 'do_nothing' ]),
        Q: mutable({ }),               // state -> action -> value
        liveReward: fixed(0),
        scoreReward: fixed(100),
        deathReward: fixed(-1000),
        lastAction: mutable('do_nothing'),
        lastDistance: mutable(0),
        lastScore: mutable(0)
    });
};

// Inherit from Game.JumpGame. A prototype made with Object.create() has no own
// `constructor`, so instances would report Game.JumpGame as their constructor.
// It is redefined here (non-enumerable, like the built-in default) so that
// `instance.constructor` is JumpGameAIClass.
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype, {
    constructor: {
        value: JumpGameAIClass,
        writable: true,
        configurable: true
    }
});

/**
 * Returns the action-value entry stored for `state`, creating it on first use.
 *
 * A state that is not yet an own key of `this.Q` gets a new entry in which
 * every action from `this.actions` starts at 0.
 *
 * @param {number|string} state - Key identifying the state.
 * @returns {Object} The mutable action-to-value map kept at `this.Q[state]`.
 */
JumpGameAIClass.prototype.getQ = function getQ(state) {
    var table = this.Q;

    if (table.hasOwnProperty(state)) {
        return table[state];
    }

    var entry = table[state] = {};

    for (var i = 0, count = this.actions.length; i < count; i++) {
        entry[this.actions[i]] = 0;
    }

    return entry;
};

/**
 * Computes the discretised distance from the player to the nearest block
 * that is at or ahead of the player's x position.
 *
 * The raw distance (`block.x - this.playerX`) is multiplied by
 * `this.resolution` and floored. When no block qualifies the result is 0.
 *
 * @returns {number} Non-negative integer distance bucket.
 */
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var nearest = -1; // -1 means "no block found yet"

    for (var i = 0; i < this.blocks.length; i++) {
        var gap = this.blocks[i].x - this.playerX;

        // Blocks behind the player do not count.
        if (!(gap >= 0)) {
            continue;
        }

        if (nearest === -1 || gap < nearest) {
            nearest = gap;
        }
    }

    var bucket = Math.floor(nearest * this.resolution);

    // Clamps the "nothing found" case (negative bucket) to 0.
    return Math.max(0, bucket);
};

/**
 * Picks the action with the larger stored value for the given state.
 *
 * On a tie the second action (`this.actions[1]`) is returned when
 * `this.canJump()` is false; otherwise an action is drawn uniformly at random.
 *
 * @param {number} distance - State key, as produced by getBlockDistance().
 * @returns {string} One of the entries of `this.actions`.
 */
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpAction = this.actions[0];
    var idleAction = this.actions[1];
    var values = this.getQ(distance);
    var jumpValue = values[jumpAction];
    var idleValue = values[idleAction];

    if (jumpValue > idleValue) {
        return jumpAction;
    }

    if (idleValue > jumpValue) {
        return idleAction;
    }

    // Tie: jumping is only a candidate when it is currently possible.
    if (!this.canJump()) {
        return idleAction;
    }

    var pick = Math.floor(Math.random() * this.actions.length);

    return this.actions[pick];
};

/**
 * Chooses the next action with an epsilon-greedy policy.
 *
 * With probability `this.epsilon` a uniformly random action is returned;
 * otherwise the highest-valued action for the current block distance is used.
 *
 * @returns {string} One of the entries of `this.actions`.
 */
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // When canJump() reports false, doing nothing is the only option.
    if (!this.canJump()) {
        return this.actions[1];
    }

    var explore = Math.random() < this.epsilon;

    if (!explore) {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }

    var pick = Math.floor(Math.random() * this.actions.length);

    return this.actions[pick];
};

/**
 * Death handler: immediately restarts the game by calling `this.restart()`.
 * NOTE(review): presumably invoked by the base game when the player dies — confirm against Game.JumpGame.
 */
JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

/**
 * Runs one learning step: rewards the previous (state, action) pair, then
 * chooses and performs the next action.
 *
 * Reward selection: `scoreReward` when the score changed since the last call,
 * otherwise `deathReward` when the player is not alive, otherwise `liveReward`.
 * The stored value for (`lastDistance`, `lastAction`) is then moved towards
 * `reward + gamma * Q(currentState, bestAction)` by the factor `alpha`.
 */
JumpGameAIClass.prototype.think = function think() {
    var reward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    } else {
        reward = this.liveReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance();
    var bestNextValue = this.getQ(distance)[this.getActionWithHighestQ(distance)];
    var previousEntry = this.getQ(this.lastDistance);
    var previousValue = previousEntry[this.lastAction];

    // Q-learning update for the action taken on the previous frame.
    previousEntry[this.lastAction] = previousValue + this.alpha * (reward + (this.gamma * bestNextValue) - previousValue);

    // Remember what is being done now so the next call can reward it.
    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    if (this.lastAction === this.actions[0]) {
        this.jump();
    }
};

/**
 * Draws two debug labels on the canvas: the current block distance and the
 * distance remembered from the previous step.
 *
 * Both labels are anchored at (canvasWidth / 2, canvasHeight / 4); the first
 * uses the 'bottom' baseline and the second the 'top' baseline. The drawing
 * state is saved before and restored after.
 */
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    var ctx = this.context;
    var anchorX = this.canvasWidth / 2;
    var anchorY = this.canvasHeight / 4;

    ctx.save();
    ctx.textAlign = 'center';

    ctx.textBaseline = 'bottom';
    ctx.fillText('Distance: ' + this.getBlockDistance(), anchorX, anchorY);

    ctx.textBaseline = 'top';
    ctx.fillText('Last Distance: ' + this.lastDistance, anchorX, anchorY);

    ctx.restore();
};

/**
 * Per-frame hook: runs the base Game.JumpGame frame logic with the same
 * arguments, then lets the agent learn and act via `think()`.
 */
JumpGameAIClass.prototype.onFrame = function onFrame() {
    var baseOnFrame = Game.JumpGame.prototype.onFrame;

    baseOnFrame.apply(this, arguments);
    this.think();
};

// Publish the AI-driven game on the shared Game object under the name JumpGameAI.
Game.JumpGameAI = JumpGameAIClass;

/* Page backdrop: light grey, with inline content centred horizontally. */
body {
    background-color: #EEEEEE;
    text-align: center;
}

/* The game canvas: white surface with a thin light-grey border. */
canvas#game {
    background-color: #FFFFFF;
    border: 1px solid #DDDDDD;
}

<!DOCTYPE HTML>
<html lang="en">
<head>
    <title>jump</title>
</head>
<body>
    <!-- 512x512 drawing surface; the heading inside is fallback content for browsers without canvas support. -->
    <canvas id="game" width="512" height="512">
        <h1>Your browser doesn't support canvas!</h1>
    </canvas>
  
    <!-- NOTE(review): presumably a requestAnimationFrame polyfill, judging by the file name; verify this URL still serves executable JavaScript. -->
    <script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
  
    <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
  
    <!-- Scripts from the gist above, loaded in order: game.js, canvasgame.js, jump.js, bootstrap.js. -->
    <!-- NOTE(review): verify that these third-party CDN URLs still resolve. -->
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>

Ответ 1

У вас в основном есть упрощенная версия:

Источник: Flappy Bird RL

Я использовал значения:

    epsilon: {
        value: 0.01
    },
    alpha: {
        value: 0.7
    },
    gamma: {
        value: 0.9
    },
    resolution: {
        value: 0.1
    },  
    liveReward: {
        value: 10
    },
    scoreReward: {
        value: -100
    },
    deathReward: {
        value: 1000
    },

Счёт без проблем превышал 100 уже в первые 20 попыток.

Q-обучение может быть описано с помощью временной логики

Q(s, a)=r(s,a)+gamma*max_a'(Q(s', a'))

Где

r(s,a)= r= Немедленное вознаграждение
gamma= относительная ценность отложенного вознаграждения по сравнению с немедленным (от 0 до 1)
s'= новое состояние после действия a
a= действие в состоянии s
a'= действие в состоянии s'

Вы должны выполнить его как

Выберите действие a и выполните его

Для каждой пары «состояние — действие» (s, a) инициализируйте запись таблицы Q(s, a) нулём
Наблюдайте текущее состояние s
Повторяйте бесконечно:
- Выберите действие a и выполните его
- Получите немедленную награду r, то есть r(s, a)
- Наблюдайте новое состояние s'
- Обновите запись таблицы: Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))
- s = s'

Ответ 2

Ваша реализация алгоритма в порядке, просто нужно настроить некоторые параметры.

Если назначить некоторую награду за выживание (10 в моём примере) и установить epsilon в 0, получится ИИ, который выигрывает.

Пример:

/**
 * Q-learning player built on top of Game.JumpGame.
 *
 * Calls the base constructor with `canvas`, then defines the learning
 * parameters and bookkeeping fields as non-enumerable own properties.
 * Hyper-parameters are read-only; `Q`, `lastAction`, `lastDistance` and
 * `lastScore` are writable because they change while the agent plays.
 *
 * @param {*} canvas - Forwarded unchanged to Game.JumpGame.
 */
var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    // Descriptor for a value that never changes after construction.
    function fixed(value) {
        return { value: value };
    }

    // Descriptor for a value the agent reassigns while it runs.
    function mutable(value) {
        return { value: value, writable: true };
    }

    Object.defineProperties(this, {
        epsilon: fixed(0),             // no random exploration
        alpha: fixed(1),               // learning rate
        gamma: fixed(1),               // discount factor
        resolution: fixed(0.1),        // distance-to-state scale
        actions: fixed([ 'jump', 'do_nothing' ]),
        Q: mutable({ }),               // state -> action -> value
        // NOTE(review): the accompanying answer text mentions a living reward
        // of 10, while this snippet uses 0 — confirm which value is intended.
        liveReward: fixed(0),
        scoreReward: fixed(100),
        deathReward: fixed(-1000),
        lastAction: mutable('do_nothing'),
        lastDistance: mutable(0),
        lastScore: mutable(0)
    });
};

// Inherit from Game.JumpGame. A prototype made with Object.create() has no own
// `constructor`, so instances would report Game.JumpGame as their constructor.
// It is redefined here (non-enumerable, like the built-in default) so that
// `instance.constructor` is JumpGameAIClass.
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype, {
    constructor: {
        value: JumpGameAIClass,
        writable: true,
        configurable: true
    }
});

/**
 * Returns the action-value entry stored for `state`, creating it on first use.
 *
 * A state that is not yet an own key of `this.Q` gets a new entry in which
 * every action from `this.actions` starts at 0.
 *
 * @param {number|string} state - Key identifying the state.
 * @returns {Object} The mutable action-to-value map kept at `this.Q[state]`.
 */
JumpGameAIClass.prototype.getQ = function getQ(state) {
    var table = this.Q;

    if (table.hasOwnProperty(state)) {
        return table[state];
    }

    var entry = table[state] = {};

    for (var i = 0, count = this.actions.length; i < count; i++) {
        entry[this.actions[i]] = 0;
    }

    return entry;
};

/**
 * Computes the discretised distance from the player to the nearest block
 * that is at or ahead of the player's x position.
 *
 * The raw distance (`block.x - this.playerX`) is multiplied by
 * `this.resolution` and floored. When no block qualifies the result is 0.
 *
 * @returns {number} Non-negative integer distance bucket.
 */
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var nearest = -1; // -1 means "no block found yet"

    for (var i = 0; i < this.blocks.length; i++) {
        var gap = this.blocks[i].x - this.playerX;

        // Blocks behind the player do not count.
        if (!(gap >= 0)) {
            continue;
        }

        if (nearest === -1 || gap < nearest) {
            nearest = gap;
        }
    }

    var bucket = Math.floor(nearest * this.resolution);

    // Clamps the "nothing found" case (negative bucket) to 0.
    return Math.max(0, bucket);
};

/**
 * Picks the best action that is currently available for the given state.
 *
 * When `this.canJump()` is false the second action (`this.actions[1]`) is
 * always returned, whatever the stored values are. Otherwise the action with
 * the larger stored value wins, and a tie is broken uniformly at random.
 *
 * @param {number} distance - State key, as produced by getBlockDistance().
 * @returns {string} One of the entries of `this.actions`.
 */
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpAction = this.actions[0];
    var idleAction = this.actions[1];
    var values = this.getQ(distance);
    var jumpValue = values[jumpAction];
    var idleValue = values[idleAction];

    // An unavailable jump must never be selected.
    if (!this.canJump()) {
        return idleAction;
    }

    if (jumpValue > idleValue) {
        return jumpAction;
    }

    if (idleValue > jumpValue) {
        return idleAction;
    }

    var pick = Math.floor(Math.random() * this.actions.length);

    return this.actions[pick];
};

/**
 * Chooses the next action with an epsilon-greedy policy.
 *
 * With probability `this.epsilon` a uniformly random action is returned;
 * otherwise the highest-valued action for the current block distance is used.
 *
 * @returns {string} One of the entries of `this.actions`.
 */
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // When canJump() reports false, doing nothing is the only option.
    if (!this.canJump()) {
        return this.actions[1];
    }

    var explore = Math.random() < this.epsilon;

    if (!explore) {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }

    var pick = Math.floor(Math.random() * this.actions.length);

    return this.actions[pick];
};

/**
 * Death handler: immediately restarts the game by calling `this.restart()`.
 * NOTE(review): presumably invoked by the base game when the player dies — confirm against Game.JumpGame.
 */
JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

/**
 * Runs one learning step: rewards the previous (state, action) pair, then
 * chooses and performs the next action.
 *
 * Reward selection: `scoreReward` when the score changed since the last call,
 * otherwise `deathReward` when the player is not alive, otherwise `liveReward`.
 * The stored value for (`lastDistance`, `lastAction`) is then moved towards
 * `reward + gamma * bestNextValue` by the factor `alpha`, where
 * `bestNextValue` is 0 once the player is dead.
 */
JumpGameAIClass.prototype.think = function think() {
    var reward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    } else {
        reward = this.liveReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance();
    var bestNextValue = 0;

    // A dead player has no future, so nothing is bootstrapped from the next state.
    if (this.playerAlive) {
        bestNextValue = this.getQ(distance)[this.getActionWithHighestQ(distance)];
    }

    var previousEntry = this.getQ(this.lastDistance);
    var previousValue = previousEntry[this.lastAction];

    // Q-learning update for the action taken on the previous frame.
    previousEntry[this.lastAction] = previousValue + this.alpha * (reward + (this.gamma * bestNextValue) - previousValue);

    // Remember what is being done now so the next call can reward it.
    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    if (this.lastAction === this.actions[0]) {
        this.jump();
    }
};

/**
 * Draws two debug labels on the canvas: the current block distance and the
 * distance remembered from the previous step.
 *
 * Both labels are anchored at (canvasWidth / 2, canvasHeight / 4); the first
 * uses the 'bottom' baseline and the second the 'top' baseline. The drawing
 * state is saved before and restored after.
 */
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    var ctx = this.context;
    var anchorX = this.canvasWidth / 2;
    var anchorY = this.canvasHeight / 4;

    ctx.save();
    ctx.textAlign = 'center';

    ctx.textBaseline = 'bottom';
    ctx.fillText('Distance: ' + this.getBlockDistance(), anchorX, anchorY);

    ctx.textBaseline = 'top';
    ctx.fillText('Last Distance: ' + this.lastDistance, anchorX, anchorY);

    ctx.restore();
};

/**
 * Per-frame hook: runs the base Game.JumpGame frame logic with the same
 * arguments, then lets the agent learn and act via `think()`.
 */
JumpGameAIClass.prototype.onFrame = function onFrame() {
    var baseOnFrame = Game.JumpGame.prototype.onFrame;

    baseOnFrame.apply(this, arguments);
    this.think();
};

// Publish the AI-driven game on the shared Game object under the name JumpGameAI.
Game.JumpGameAI = JumpGameAIClass;

/* Page backdrop: light grey, with inline content centred horizontally. */
body {
    background-color: #EEEEEE;
    text-align: center;
}

/* The game canvas: white surface with a thin light-grey border. */
canvas#game {
    background-color: #FFFFFF;
    border: 1px solid #DDDDDD;
}

<!DOCTYPE HTML>
<html lang="en">
<head>
    <title>jump</title>
</head>
<body>
    <!-- 512x512 drawing surface; the heading inside is fallback content for browsers without canvas support. -->
    <canvas id="game" width="512" height="512">
        <h1>Your browser doesn't support canvas!</h1>
    </canvas>
  
    <!-- NOTE(review): presumably a requestAnimationFrame polyfill, judging by the file name; verify this URL still serves executable JavaScript. -->
    <script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
  
    <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
  
    <!-- Scripts from the gist above, loaded in order: game.js, canvasgame.js, jump.js, bootstrap.js. -->
    <!-- NOTE(review): verify that these third-party CDN URLs still resolve. -->
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>