author    borisgin <boris.ginsburg@gmail.com>  Sat, 15 Jul 2017 21:44:27 +0000 (14:44 -0700)
committer borisgin <boris.ginsburg@gmail.com>  Sat, 15 Jul 2017 21:44:27 +0000 (14:44 -0700)
index ddc84ebf72d2c1e5dd4a167d79262112cacbe672..df4e5a17e8e9f4463f5671ac48246ee45853e9d3 100644 (file)
void PreSolve();
float GetLearningRate();
float GetMomentum();
+
+ float getLocalRate(int param_id) const;
float local_decay(int param_id) const;
- void ApplyUpdate(int param_id, void* handle, bool clear_grads) override;
+ void ApplyUpdate(int param_id, void* handle, bool clear_grads) override;
virtual void Normalize(int param_id, void* handle);
virtual void Regularize(int param_id, void* handle);
virtual void ComputeUpdateValue(int param_id, void* handle, float rate, bool clear_grads);
virtual void SnapshotSolverStateToHDF5(const string& model_filename);
virtual void RestoreSolverStateFromHDF5(const string& state_file);
virtual void RestoreSolverStateFromBinaryProto(const string& state_file);
+ void PrintParams(int param_id);
+
// history maintains the historical momentum data.
// update maintains update related data and is not needed in snapshots.
// temp maintains other information that might be needed in computation
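Taken together, the changes below add an opt-in automatic per-layer learning rate to the SGD solver. The header gains two members: getLocalRate(param_id), which computes the per-parameter rate multiplier, and PrintParams(param_id), a debug helper that logs weight, gradient, and history norms. Judging from the getLocalRate() implementation further down, the multiplier replaces the usual lr_mult with

    local_lr = 0.001 * ||W||_2 / ||dW||_2    (when ||dW|| > 0 and the blob's lr_mult > 0)

so layers whose gradients are large relative to their weights take proportionally smaller steps.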
diff --git a/models/alexnet_bn/solver_autolr.prototxt b/models/alexnet_bn/solver_autolr.prototxt
--- /dev/null
@@ -0,0 +1,36 @@
+net: "models/alexnet_bn/train_val2.prototxt"
+
+test_iter: 195 # 50000/256
+test_interval: 5000
+#test_interval: 80000
+test_initialization: false
+
+display: 100
+
+max_iter: 250000 # 100 epochs
+
+lr_policy: "poly"
+base_lr: 1
+power: 2.
+momentum: 0.9
+weight_decay: 0.0005
+
+local_lr_auto: true
+
+snapshot: 500000
+snapshot_prefix: "models/alexnet_bn/snapshots/alexnet_bn"
+snapshot_after_train: false
+
+solver_mode: GPU
+random_seed: 1
+#debug_info: true
+
+# Train dataset size = 1,281,167
+# Test dataset size = 50,000
+
+# batch 64 --> epoch = 20,000
+# batch 96 --> epoch = 15,000
+# batch 128 --> epoch = 10,000
+# batch 256 --> epoch = 5,000
+# batch 512 --> epoch = 2,500 #
+# batch 1024--> epoch = 1,250
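The schedule itself is Caffe's standard "poly" policy, rate = base_lr * (1 - iter/max_iter)^power; with base_lr: 1 and power: 2 the global factor decays quadratically from 1 to 0 over the 250,000 iterations, and the absolute step size is left to the automatic local rate. A minimal standalone sketch of that schedule (not the solver's code path, just the arithmetic):

    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    // Caffe "poly" policy: rate = base_lr * (1 - iter/max_iter)^power
    float poly_rate(float base_lr, int iter, int max_iter, float power) {
      return base_lr * std::pow(1.0f - float(iter) / float(max_iter), power);
    }

    int main() {
      // base_lr: 1, power: 2, max_iter: 250000, as in solver_autolr.prototxt
      for (int iter : {0, 125000, 225000})
        std::printf("iter %6d -> rate %.4f\n",
                    iter, poly_rate(1.f, iter, 250000, 2.f));
      // iter      0 -> rate 1.0000
      // iter 125000 -> rate 0.2500
      // iter 225000 -> rate 0.0100
      return 0;
    }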
index 3f653cdc604f3953a2deb4795387e25f7568faaa..e6d350169bd198ddbf41510225dc04077ddde6c8 100755 (executable)
#!/usr/bin/env sh
./build/tools/caffe train \
- --solver=models/alexnet_bn/solver.prototxt -gpu=all \
- 2>&1 | tee models/alexnet_bn/logs/alexnet_bn_base2_lr0.08_wd0.0005_l4.log
+ --solver=models/alexnet_bn/solver_autolr.prototxt -gpu=all \
+ 2>&1 | tee models/alexnet_bn/logs/alexnet_bn_autolr.log
+
index a1c50bc93262ffc7b2a2f5b917c40fb8532921d1..94685bbde47f28c1e08c962dc5424a6d75ddcb95 100644 (file)
}
bias_filler {
type: "constant"
- value: 0
+ value: 0.01
}
}
}
}
bias_filler {
type: "constant"
- value: 0
+ value: 0.01
}
}
}
}
bias_filler {
type: "constant"
- value: 0
+ value: 0.01
}
}
}
}
bias_filler {
type: "constant"
- value: 0
+ value: 0.01
}
}
}
}
bias_filler {
type: "constant"
- value: 0.1
+ value: 0.01
}
}
}
}
bias_filler {
type: "constant"
- value: 0
+ value: 0.01
}
}
}
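Every constant bias_filler in the model changes to 0.01 (including one that was previously 0.1). A plausible motivation, given getLocalRate() below: under local_lr_auto a blob initialized to all zeros has ||W|| = 0, so its local rate 0.001 * ||W|| / ||dW|| is 0 and the blob would never move off its initialization; a small nonzero bias init avoids that degenerate case.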
index 7656488a6b93a1c602fd32434d35a594dba85705..6e711d0a9587ca0e535b7b6b895b234fcc3a976d 100644 (file)
// NOTE
// Update the next available ID when you add a new SolverParameter field.
//
-// SolverParameter next available ID: 49 (last added: momentum_power)
+// SolverParameter next available ID: 50 (last added: local_lr_auto)
message SolverParameter {
//////////////////////////////////////////////////////////////////////////////
// Specifying the train and test networks
optional string momentum_policy = 46 [default = "fixed"];
optional float max_momentum = 47 [default = 0.99];
optional float momentum_power = 48 [default = 1.];
+
+ optional bool local_lr_auto = 49 [default = false];
optional float weight_decay = 12; // The weight decay.
// regularization types supported: L1 and L2
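With the field registered, the behavior is switched on from the solver definition, as the new solver_autolr.prototxt above does:

    local_lr_auto: true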
index 7357757afdc4a202f19679cfe32cd33b6b703661..b4d1ca2d74e0c28a2d51ffdedab99199ae5c25da 100644 (file)
} else if (lr_policy == "sigmoid") {
rate = this->param_.base_lr() / (1.F +
exp(-this->param_.gamma() * (double(this->iter_ - this->param_.stepsize()))));
- } else {
+ }
+// else if (lr_policy == "auto") {
+// rate = 1.0;
+// }
+ else {
LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
}
return rate;
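An "auto" lr_policy branch is sketched here but left commented out: the global schedule stays whatever lr_policy selects (poly in the new solver file), and the automatic scaling is instead applied per parameter inside ComputeUpdateValue() via getLocalRate().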
template<typename Dtype>
void
SGDSolver<Dtype>::ComputeUpdateValue(int param_id, void* handle, float rate, bool clear_grads) {
+ if (this->param_.debug_info() ) {
+ PrintParams(param_id);
+ }
shared_ptr<Blob> param = this->net_->learnable_params()[param_id];
-
shared_ptr<TBlob<Dtype>> history = history_[param_id];
- const vector<float>& net_params_lr = this->net_->params_lr();
float momentum = GetMomentum();
- float local_rate = rate * net_params_lr[param_id];
+
+// const vector<float>& net_params_lr = this->net_->params_lr();
+// float local_rate = rate * net_params_lr[param_id];
+ float local_rate = rate * getLocalRate(param_id);
+ //LOG(INFO) << "local_rate=" << local_rate;
+
// Compute the update to history, then copy it to the parameter diff.
if (Caffe::mode() == Caffe::CPU) {
caffe_cpu_axpby<Dtype>(param->count(), local_rate, param->cpu_diff<Dtype>(), momentum,
@@ -251,34 +261,62 @@ SGDSolver<Dtype>::ComputeUpdateValue(int param_id, void* handle, float rate, boo
}
}
+template<typename Dtype>
+float SGDSolver<Dtype>::getLocalRate(int param_id) const {
+ const vector<float>& net_params_lr = this->net_->params_lr();
+ float local_lr = net_params_lr[param_id];
+
+ if (this->param_.local_lr_auto()) {
+ const int layer_id = this->net_->param_layer_indices(param_id).first;
+ shared_ptr<Blob> param = this->net_->learnable_params()[param_id];
+ float w_norm = std::sqrt(param->sumsq_data());
+ float wgrad_norm = std::sqrt(param->sumsq_diff());
+// shared_ptr<TBlob<Dtype>> history = history_[param_id];
+// float h_norm = std::sqrt(history->sumsq_data());
+ float ratio = 1.;
+ if (wgrad_norm > 0.) {
+ ratio = 0.001 * w_norm / wgrad_norm;
+ }
+// LOG(INFO) << "ratio=" << ratio;
+ if (local_lr > 0.) {
+ local_lr = ratio;
+ }
+ if ( Caffe::root_solver() && this->param_.display() && (this->iter_ % this->param_.display() == 0)) {
+ const string& layer_name = this->net_->layer_names()[layer_id];
+// const string& layer_type = this->net_->layers()[layer_id]->type();
+ const int blob_id = this->net_->param_layer_indices(param_id).second;
+ LOG(INFO) << layer_name <<"."<< blob_id << " lr=" << local_lr;
+ }
+ }
+
+ return local_lr;
+}
+
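getLocalRate() thus implements a layer-wise adaptive step: the configured lr_mult is replaced (when positive) by a trust coefficient of 0.001 times the weight-to-gradient norm ratio, with a fallback of 1 when the gradient norm is zero. A self-contained sketch of the same arithmetic, detached from the Blob/Net types used above:

    #include <cmath>
    #include <cstdio>

    // Core of getLocalRate(): scale the step by trust * ||W|| / ||dW||,
    // falling back to 1 when the gradient norm is zero.
    float local_rate(float sumsq_w, float sumsq_dw, float trust = 0.001f) {
      float w_norm  = std::sqrt(sumsq_w);
      float dw_norm = std::sqrt(sumsq_dw);
      return (dw_norm > 0.f) ? trust * w_norm / dw_norm : 1.f;
    }

    int main() {
      // e.g. ||W|| = 10, ||dW|| = 0.5  ->  0.001 * 10 / 0.5 = 0.02
      std::printf("%.4f\n", local_rate(100.f, 0.25f));  // prints 0.0200
      return 0;
    }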
template<typename Dtype>
float SGDSolver<Dtype>::local_decay(int param_id) const {
const vector<float>& net_params_weight_decay = this->net_->params_weight_decay();
float weight_decay = this->param_.weight_decay() * net_params_weight_decay[param_id];
- const std::string& regularization_type = this->param_.regularization_type();
- //FIXME: BG
- if (regularization_type == "L2_unitary") {
+ return weight_decay;
+}
+
+template<typename Dtype>
+void SGDSolver<Dtype>::PrintParams(int param_id) {
+ if ( Caffe::root_solver() && this->param_.display() && (this->iter_ % this->param_.display() == 0)) {
const int layer_id = this->net_->param_layer_indices(param_id).first;
const int blob_id = this->net_->param_layer_indices(param_id).second;
const string& layer_name = this->net_->layer_names()[layer_id];
const string& layer_type = this->net_->layers()[layer_id]->type();
- float factor = 1.;
- if ( (layer_type == "Convolution") && (blob_id==0) ) {
- shared_ptr<Blob> param = this->net_->learnable_params()[param_id];
- float w_norm = param->sumsq_data();
- if (w_norm > 0.) {
-// factor = 1. - 1./(w_norm * w_norm);
- factor = w_norm * w_norm - 1;
- }
- if ( Caffe::root_solver() && this->param_.display() && (this->iter_ % this->param_.display() == 0)) {
- //LOG(INFO) << "L2_unitary: " << layer_id <<"."<< blob_id << " " << layer_name << " " << factor;
- LOG(INFO) << "L2_unitary: " << layer_name << " " << w_norm;
- }
- }
- return (weight_decay * factor) ;
- } else
- return weight_decay;
+ shared_ptr<Blob> param = this->net_->learnable_params()[param_id];
+ shared_ptr<TBlob<Dtype>> history = history_[param_id];
+ if ((layer_type == "Convolution") || (layer_type == "InnerProduct")) {
+ float w_norm = std::sqrt(param->sumsq_data());
+ float wgrad_norm = std::sqrt(param->sumsq_diff());
+ float h_norm = std::sqrt(history->sumsq_data());
+ DLOG(INFO) << "SGD_update " << layer_name << "." << blob_id
+ << " W=" << w_norm << " \tdW=" << wgrad_norm << " \tH="<< h_norm;
+ }
+ }
}
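One caveat on the new debug path: PrintParams() is called only when debug_info is set (see ComputeUpdateValue above) and is further gated on the display interval, but the norms themselves go through DLOG(INFO), which glog compiles away in NDEBUG builds, so the W/dW/H trace appears only in debug builds.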
template<typename Dtype>