00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef AlpsKnowledgeBrokerMPI_h_
00024 #define AlpsKnowledgeBrokerMPI_h_
00025
00026 #include <cmath>
00027 #include <iosfwd>
00028
00029 #undef SEEK_SET
00030 #undef SEEK_END
00031 #undef SEEK_CUR
00032 #include "mpi.h"
00033
00034 #include "AlpsEnumProcessT.h"
00035 #include "AlpsKnowledge.h"
00036 #include "AlpsKnowledgeBroker.h"
00037 #include "AlpsParams.h"
00038
00039
00040
// MPI flavour of the knowledge broker. The class derives from
// AlpsKnowledgeBroker and adds MPI handles (communicators, a group, pending
// requests), per-process bookkeeping fields, and the message-passing member
// functions declared below.
//
// Only declarations and a handful of inline accessors are visible in this
// header. Anything below that describes what a member is *for* (as opposed to
// what its declaration or inline body literally shows) is marked NOTE(review)
// and should be confirmed against the implementation file.
00041 class AlpsKnowledgeBrokerMPI : public AlpsKnowledgeBroker {
00042
00043 private:
// Copy construction and copy assignment are declared private and no
// definition is visible here, so code outside the class cannot copy a broker.
00045 AlpsKnowledgeBrokerMPI(const AlpsKnowledgeBrokerMPI&);
00046 AlpsKnowledgeBrokerMPI& operator=(const AlpsKnowledgeBrokerMPI&);
00047
00048 protected:
00049
// ---- Process / topology information -------------------------------------
// NOTE(review): presumably the total number of processes in the run — confirm.
00055 int processNum_;
00056
// NOTE(review): presumably the number of hub processes — confirm.
00058 int hubNum_;
00059
// Value returned by getProcRank(); compared with masterRank_ in
// getBestQuality().
00061 int globalRank_;
00062
// MPI communicator handle. NOTE(review): presumably spans this process's
// cluster — confirm.
00064 MPI_Comm clusterComm_;
00065
// MPI communicator handle. NOTE(review): presumably spans the hubs — confirm.
00067 MPI_Comm hubComm_;
00068
// MPI group handle. NOTE(review): presumably the group behind hubComm_.
00070 MPI_Group hubGroup_;
00071
00073 int clusterSize_;
00074
// NOTE(review): name suggests a user-requested cluster size — confirm.
00076 int userClusterSize_;
00077
00079 int clusterRank_;
00080
// Raw pointer to int. NOTE(review): presumably an array of hub ranks;
// length, allocation and release are not visible in this header.
00082 int* hubRanks_;
00083
00085 int myHubRank_;
00086
// Value returned by getMasterRank().
00088 int masterRank_;
00089
// Value returned by getProcType().
00091 AlpsProcessType processType_;
00092
// Raw pointer to AlpsProcessType. NOTE(review): presumably one entry per
// process; ownership is not visible in this header.
00094 AlpsProcessType* processTypeList_;
00095
// NOTE(review): name suggests "this hub also processes nodes" — confirm.
00097 bool hubWork_;
00098
// ---- Pending MPI request handles -----------------------------------------
// NOTE(review): the L/R suffixes presumably denote two children in a
// tree-shaped forwarding pattern — confirm against the implementation.
00100 MPI_Request subTreeRequest_;
00101
00103 MPI_Request solRequestL_;
00104 MPI_Request solRequestR_;
00105
00107 MPI_Request modelKnowRequestL_;
00108 MPI_Request modelKnowRequestR_;
00109
00111 MPI_Request forwardRequestL_;
00112 MPI_Request forwardRequestR_;
00114
// ---- Incumbent ------------------------------------------------------------
// Value returned by getIncumbentValue().
00120 double incumbentValue_;
00121
// NOTE(review): presumably identifies the process holding the incumbent.
00123 int incumbentID_;
00124
00127 bool updateIncumbent_;
00129
// ---- Workload quality / quantity -------------------------------------------
// NOTE(review): the un-prefixed / cluster / system variants presumably hold
// the value for this process, its cluster, and the whole system — confirm.
00135 double workQuality_;
00136
00138 double clusterWorkQuality_;
00139
// Value returned by getBestEstimateQuality().
00141 double systemWorkQuality_;
00142
// Raw pointer to double; length and ownership are not visible here.
00144 double* hubWorkQualities_;
00145
// Raw pointer to double; length and ownership are not visible here.
00148 double* workerWorkQualities_;
00149
00151 double workQuantity_;
00152
00154 double clusterWorkQuantity_;
00155
// Converted to int (static_cast) and returned by getNumNodeLeftSystem().
00157 double systemWorkQuantity_;
00158
00160 double systemWorkQuantityForce_;
00161
// Raw pointer to double; length and ownership are not visible here.
00163 double* hubWorkQuantities_;
00164
// Raw pointer to double; length and ownership are not visible here.
00167 double* workerWorkQuantities_;
00168
// Raw pointer to bool. NOTE(review): presumably one flag per worker.
00170 bool* workerReported_;
00171
// Raw pointer to bool. NOTE(review): presumably one flag per hub.
00173 bool* hubReported_;
00174
00176 bool allHubReported_;
00177
// Declared as int, not bool. NOTE(review): may be a counter rather than a
// flag — confirm.
00179 int masterDoBalance_;
00180
// Declared as int, not bool. NOTE(review): may be a counter rather than a
// flag — confirm.
00182 int hubDoBalance_;
00183
// Raw pointer to int; length and ownership are not visible here.
00185 int* workerNodeProcesseds_;
00186
00188 int clusterNodeProcessed_;
00189
// Raw pointer to int; length and ownership are not visible here.
00191 int* hubNodeProcesseds_;
00193
// ---- Message counters -------------------------------------------------------
// NOTE(review): presumably maintained through incSendCount()/decSendCount()/
// incRecvCount()/decRecvCount() declared further down — confirm.
00199 int sendCount_;
00200
00202 int recvCount_;
00203
00206 int clusterSendCount_;
00207
00210 int clusterRecvCount_;
00211
00213 int systemSendCount_;
00214
00216 int systemRecvCount_;
00218
00223 int masterIndexBatch_;
00225
// ---- Timing and statistics --------------------------------------------------
00231 AlpsTimer masterTimer_;
00232
00234 AlpsTimer hubTimer_;
00235
00237 AlpsTimer workerTimer_;
00238
// NOTE(review): the four doubles below are presumably accumulated durations;
// their unit is not visible in this header.
00240 double rampUpTime_;
00241
00243 double rampDownTime_;
00244
00246 double idleTime_;
00247
00249 double msgTime_;
00250
00252 AlpsPsStats psStats_;
00254
// ---- Control flags ----------------------------------------------------------
00256 bool forceTerminate_;
00257
00259 bool blockTermCheck_;
00260
00262 bool blockHubReport_;
00263
00265 bool blockWorkerReport_;
00266
00268 bool blockAskForWork_;
00269
// ---- Message buffers --------------------------------------------------------
// Raw char pointers; buffer sizes, allocation and release are not visible in
// this header.
00271 char *attachBuffer_;
00272
00274 char *largeBuffer_;
00275
00277 char *largeBuffer2_;
00278
00280 char *smallBuffer_;
00281
// NOTE(review): presumably a time interval between master balancing rounds;
// unit not visible here.
00284 double masterBalancePeriod_;
00285
// NOTE(review): presumably a time interval between hub reports; unit not
// visible here.
00288 double hubReportPeriod_;
00289
00291 int modelGenID_;
00292
00294 int modelGenPos_;
00295
// Raw pointer to AlpsSubTree; ownership is not visible in this header.
00297 AlpsSubTree* rampUpSubTree_;
00298
00300 int unitWorkNodes_;
00301
// Declared as int, not bool.
00303 int haltSearch_;
00304
// Second protected section: member functions. Except where an inline body is
// given, only the signatures are visible here.
00305 protected:
00306
// Called from both constructors before anything else in their bodies.
00308 void init();
00309
// Per-role entry points. masterMain() takes the root tree node; hubMain() and
// workerMain() take no arguments.
00316 void masterMain(AlpsTreeNode* root);
00317
00320 void hubMain();
00321
00324 void workerMain();
00326
00328
00329
// Takes unitWork and unitTime by value; exitStatus, numNodesProcessed, depth
// and betterSolution are non-const references, i.e. outputs (or in/outs) of
// the call. Returns an AlpsReturnStatus.
00330 AlpsReturnStatus doOneUnitWork(int unitWork,
00331 double unitTime,
00332 AlpsExitStatus & exitStatus,
00333 int & numNodesProcessed,
00334 int & depth,
00335 bool & betterSolution);
00336
// All three arguments are non-const references. buffer is a reference to a
// pointer, so the callee is able to reseat the caller's pointer (whether it
// does is not visible here).
00338 void processMessages(char *&buffer,
00339 MPI_Status &status,
00340 MPI_Request &request);
00341
// Two families of per-role routines with matching shapes: only the master
// variant receives the root node.
// NOTE(review): "rootInit" and "spiral" presumably name two alternative
// initial work-distribution schemes — confirm.
00343 void rootInitMaster(AlpsTreeNode* root);
00344 void rootInitHub();
00345 void rootInitWorker();
00346
00348 void spiralMaster(AlpsTreeNode* root);
00349 void spiralHub();
00350 void spiralWorker();
00351
00352
00353
// masterAskHubDonate() and hubAskWorkerDonate() share one parameter list:
// a donor id, a receiver id and the receiver's workload as a double.
00358 void masterAskHubDonate(int donorID,
00359 int receiverID,
00360 double receiverWorkload);
00361
00363 void hubAskWorkerDonate(int donorID,
00364 int receiverID,
00365 double receiverWorkload);
00366
00368 void updateWorkloadInfo();
00369
// Returns systemWorkQuantity_ converted to int with static_cast (the
// fractional part is discarded by the conversion).
00370 virtual int getNumNodeLeftSystem()
00371 { return static_cast<int>(systemWorkQuantity_); }
00372
// recvID defaults to -1 and recvWL to 0.0 when the caller omits them.
// buf is a reference to a pointer (callee may reseat it).
00374 void donateWork(char*& buf,
00375 int tag,
00376 MPI_Status* status,
00377 int recvID = -1,
00378 double recvWL = 0.0);
00379
00381 void hubAllocateDonation(char*& buf, MPI_Status* status);
00382
00384 void hubBalanceWorkers();
00385
00387 void hubSatisfyWorkerRequest(char*& buf, MPI_Status* status);
00388
00390
// Takes the message tag and the communicator to use.
00391 void hubReportStatus(int tag, MPI_Comm comm);
00392
00394
00395 void hubUpdateCluStatus(char*& buf, MPI_Status* status, MPI_Comm comm);
00396
00398 void hubsShareWork(char*& buf, MPI_Status* status);
00399
00401 void masterBalanceHubs();
00402
00404
00405 void masterUpdateSysStatus(char*& buf, MPI_Status* status, MPI_Comm comm);
00406
00408 void refreshSysStatus();
00409
00411 void refreshClusterStatus();
00412
00414
// Same parameter list as hubReportStatus().
00415 void workerReportStatus(int tag, MPI_Comm comm);
00417
00418
00419
// Index exchange: one request routine without arguments and two routines
// that take a large buffer by reference-to-pointer.
00425 void workerAskIndices();
00426
00428 void workerRecvIndices(char *&bufLarge);
00429
00431 void masterSendIndices(char *&bufLarge);
00433
00434
00435
00441 void broadcastModel(const int id, const int source);
00442
00444 void sendIncumbent();
00445
// Returns bool. NOTE(review): presumably true when the unpacked incumbent
// was accepted — confirm against the implementation.
00448 bool unpackSetIncumbent(char*& buf, MPI_Status* status);
00449
00451 void collectBestSolution(int destination);
00452
00455 void tellMasterRecv();
00456
00459
00460 void tellHubRecv();
00461
// buf, size and position are all non-const references, so the callee can
// update the caller's buffer pointer, size and position.
00466 void packEncoded(AlpsEncoded* enc,
00467 char*& buf,
00468 int& size,
00469 int& position,
00470 MPI_Comm comm);
00471
// Returns a raw AlpsEncoded pointer; who releases it is not visible here.
// size defaults to -1 when omitted.
00473 AlpsEncoded* unpackEncoded(char*& buf,
00474 int& position,
00475 MPI_Comm comm,
00476 int size = -1);
00477
00480
00481 void receiveSizeBuf(char*& buf,
00482 int sender,
00483 int tag,
00484 MPI_Comm comm,
00485 MPI_Status* status);
00486
00489
00490 void receiveRampUpNode(int sender,
00491 MPI_Comm comm,
00492 MPI_Status* status);
00493
00496 void receiveSubTree(char*& buf, int sender, MPI_Status* status);
00497
00499
// Counterpart (by name and parameters) of receiveSizeBuf(): takes the buffer,
// its size and position, plus target rank, tag and communicator.
00500 void sendSizeBuf(char*& buf,
00501 int size,
00502 int position,
00503 const int target,
00504 const int tag,
00505 MPI_Comm comm);
00506
00509
00510 void sendRampUpNode(const int target, MPI_Comm comm);
00511
00514 void sendNodeModelGen(int receiver, int doUnitWork);
00515
// Returns bool; st is a reference to a pointer, so the callee can change
// which subtree the caller's pointer refers to.
00517 bool sendSubTree(const int target, AlpsSubTree*& st, int tag);
00518
00520
00521 void sendFinishInit(const int target, MPI_Comm comm);
00523
00525 void deleteSubTrees();
00526
00527
00528 void forwardModelKnowledge();
00529
00531
// receiver defaults to -1 when omitted.
00532 void sendModelKnowledge(MPI_Comm comm, int receiver=-1);
00533
00535
00536 void receiveModelKnowledge(MPI_Comm comm);
00537
// Four counter helpers with identical parameter lists: a C string `how` and
// a step `s` that defaults to 1.
// NOTE(review): presumably they adjust sendCount_/recvCount_ and `how` is a
// label for diagnostics — confirm.
00542 void incSendCount(const char* how, int s = 1);
00544 void decSendCount(const char* how, int s = 1);
00546 void incRecvCount(const char* how, int s = 1);
00548 void decRecvCount(const char* how, int s = 1);
00550
00552 void masterForceHubTerm();
00553
00555 void hubForceWorkerTerm();
00556
// changeWorkThreshold is a non-const reference, so the callee can modify the
// caller's threshold.
00558 void changeWorkingSubTree(double & changeWorkThreshold);
00559
00561 void sendErrorCodeToMaster(int errorCode);
00562
00564 void recvErrorCode(char *& bufLarge);
00565
00567 void spiralRecvProcessNode();
00568
00570 void spiralDonateNode();
00571
00572 public:
00573
// Default constructor: constructs the AlpsKnowledgeBroker base, then calls
// init().
00576 AlpsKnowledgeBrokerMPI()
00577 :
00578 AlpsKnowledgeBroker()
00579 {
00580 init();
00581 }
00582
// Constructs the AlpsKnowledgeBroker base, calls init(), and then calls
// initializeSearch(argc, argv, model) with the arguments it was given.
00584 AlpsKnowledgeBrokerMPI(int argc,
00585 char* argv[],
00586 AlpsModel& model)
00587 :
00588 AlpsKnowledgeBroker()
00589 {
00590 init();
00591 initializeSearch(argc, argv, model);
00592 }
00593
// Destructor; defined outside this header.
00595 ~AlpsKnowledgeBrokerMPI();
00596
// Returns globalRank_.
00598 virtual int getProcRank() const { return globalRank_; }
00599
// Returns masterRank_.
00601 virtual int getMasterRank() const { return masterRank_; }
00602
// Returns processType_.
00604 virtual AlpsProcessType getProcType() const { return processType_; }
00605
// Also invoked by the three-argument constructor above.
00618 void initializeSearch(int argc, char* argv[], AlpsModel& model);
00619
00621 void search(AlpsModel *model);
00622
00630 void rootSearch(AlpsTreeNode* root);
00631
// Returns incumbentValue_.
00635 virtual double getIncumbentValue() const {
00636 return incumbentValue_;
00637 }
// Returns the .second of getBestKnowledge(AlpsKnowledgeTypeSolution) only
// when this process is the master (globalRank_ == masterRank_) AND
// getNumKnowledges(AlpsKnowledgeTypeSolution) is positive. In every other
// case — a non-master process, or a master with no solution — it returns
// ALPS_OBJ_MAX.
00639 virtual double getBestQuality() const {
00640 if (globalRank_ == masterRank_) {
00641 if (getNumKnowledges(AlpsKnowledgeTypeSolution) > 0) {
00642 return getBestKnowledge(AlpsKnowledgeTypeSolution).second;
00643 }
00644 else {
00645 return ALPS_OBJ_MAX;
00646 }
00647 }
00648 else {
00649 return ALPS_OBJ_MAX;
00650 }
00651 }
00652
// Returns systemWorkQuality_. Note: unlike the accessors above, this one is
// not declared const.
00654 virtual double getBestEstimateQuality() { return systemWorkQuality_; }
00655
// outputFile defaults to a null pointer (0).
// NOTE(review): how a null outputFile is handled is not visible here.
00657 virtual void printBestSolution(char* outputFile = 0) const;
00658
00660 virtual void searchLog();
00662
00663
00664
// sendKnowledge(), receiveKnowledge() and requestKnowledge() take parallel
// parameter lists: knowledge type, sender and receiver ids, a message buffer
// (reference to pointer), its size, a tag, a communicator and a `blocking`
// flag. receiveKnowledge() additionally takes an MPI_Status*.
00669 void sendKnowledge(AlpsKnowledgeType type,
00670 int sender,
00671 int receiver,
00672 char *& msgBuffer,
00673 int msgSize,
00674 int msgTag,
00675 MPI_Comm comm,
00676 bool blocking);
00677
00679 void receiveKnowledge(AlpsKnowledgeType type,
00680 int sender,
00681 int receiver,
00682 char *& msgBuffer,
00683 int msgSize,
00684 int msgTag,
00685 MPI_Comm comm,
00686 MPI_Status* status,
00687 bool blocking);
00688
00690 void requestKnowledge(AlpsKnowledgeType type,
00691 int sender,
00692 int receiver,
00693 char *& msgBuffer,
00694 int msgSize,
00695 int msgTag,
00696 MPI_Comm comm,
00697 bool blocking);
00699
00700 };
00701 #endif
00702
00703