@inproceedings{kopanski21planbb, author = {Jan Kopanski and Krzysztof Rzadca}, title = {Plan-based Job Scheduling for Supercomputers with Shared Burst Buffers}, booktitle = {Euro-Par '21 Proceedings: 27th International European Conference on Parallel and Distributed Computing (accepted)}, year = {2021}, keywords = {resource management, resource_management, scheduling, burst_buffer, hpc}, publisher = {Springer}, abstract = {The ever-increasing gap between compute and I/O performance in HPC platforms, together with the development of novel NVMe storage devices (NVRAM), led to the emergence of the burst buffer concept---an intermediate persistent storage layer logically positioned between random-access main memory and a parallel file system. Despite the development of real-world architectures as well as research concepts, resource and job management systems, such as Slurm, provide only marginal support for scheduling jobs with burst buffer requirements, in particular ignoring burst buffers when backfilling. We investigate the impact of burst buffer reservations on the overall efficiency of online job scheduling for common algorithms: First-Come-First-Served (FCFS) and Shortest-Job-First (SJF) EASY-backfilling. We evaluate the algorithms in a detailed simulation with I/O side effects. Our results indicate that the lack of burst buffer reservations in backfilling may significantly deteriorate scheduling. We also show that these algorithms can be easily extended to support burst buffers. Finally, we propose a burst-buffer--aware plan-based scheduling algorithm with simulated annealing optimisation, which improves the mean waiting time by over 20\% and mean bounded slowdown by 27\% compared to the burst-buffer--aware SJF-EASY-backfilling.} }
@inproceedings{naruszko21loglinbb, author = {Adrian Naruszko and Bartlomiej Przybylski and Krzysztof Rzadca}, title = {A log-linear (2+5/6)-approximation algorithm for parallel machine scheduling with a single orthogonal resource}, booktitle = {Euro-Par '21 Proceedings: 27th International European Conference on Parallel and Distributed Computing (accepted)}, year = {2021}, keywords = {resource management, resource_management, scheduling, burst_buffer, hpc}, publisher = {Springer}, abstract = {As the gap between compute and I/O performance tends to grow, modern High-Performance Computing (HPC) architectures include a new resource type: an intermediate persistent fast memory layer, called burst buffers. This is just one of many kinds of renewable resources which are orthogonal to the processors themselves, such as network bandwidth or software licenses. Ignoring orthogonal resources while making scheduling decisions just for processors may lead to unplanned delays of jobs whose resource requirements cannot be immediately satisfied. We focus on a classic problem of makespan minimization for parallel-machine scheduling of independent sequential jobs with additional requirements on the amount of a single renewable orthogonal resource. We present an easily-implementable log-linear algorithm that we prove to be a (2+5/6)-approximation. In simulation experiments, we compare our algorithm to standard greedy list-scheduling heuristics and show that, compared to LPT, resource-based algorithms generate significantly shorter schedules.} }
@inproceedings{bap21datadrivenfaas, author = {Bartlomiej Przybylski and Pawel Zuk and Krzysztof Rzadca}, title = {Data-driven scheduling in serverless computing to reduce response time}, booktitle = {CCGrid '21 Proceedings: 21st IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing}, keywords = {resource management, scheduling, resource_management, datacenter, FaaS}, year = {2021}, abstract = {In Function as a Service (FaaS), a serverless computing variant, customers deploy functions instead of complete virtual machines or Linux containers. It is the cloud provider who maintains the runtime environment for these functions. FaaS products are offered by all major cloud providers (e.g. Amazon Lambda, Google Cloud Functions, Azure Functions); as well as standalone open-source software (e.g. Apache OpenWhisk) with their commercial variants (e.g. Adobe I/O Runtime or IBM Cloud Functions). We take the bottom-up perspective of a single node in a FaaS cluster. We assume that all the execution environments for a set of functions assigned to this node have already been installed. Our goal is to schedule individual invocations of functions, passed by a load balancer, to minimize performance metrics related to response time. Deployed functions are usually executed repeatedly in response to multiple invocations made by end-users. Thus, our scheduling decisions are based on the information gathered locally: the recorded call frequencies and execution times. We propose a number of heuristics, and we also adapt some theoretically-grounded ones like SEPT or SERPT. Our simulations use a recently-published Azure Functions Trace. We show that, compared to the baseline FIFO or round-robin, our data-driven scheduling decisions significantly improve the performance.} }
@inproceedings{bashir2021peaklimit, author = {Noman Bashir and Nan Deng and Krzysztof Rzadca and David E. Irwin and Sree Kodakara and Rohit Jnagal}, title = {Take it to the limit: peak prediction-driven resource overcommitment in datacenters}, booktitle = {EuroSys '21: Proceedings of the Sixteenth EuroSys Conference}, optbooktitle = {EuroSys '21: Sixteenth European Conference on Computer Systems, Online Event, United Kingdom, April 26-28, 2021}, pages = {556--573}, publisher = {{ACM}}, keywords = {resource management, scheduling, resource_management, datacenter}, year = {2021}, url = {https://doi.org/10.1145/3447786.3456259}, doi = {10.1145/3447786.3456259}, abstract = {To increase utilization, datacenter schedulers often overcommit resources, where the sum of resources allocated to the tasks on a machine exceeds its physical capacity. Setting the right level of overcommitment is a challenging problem: low overcommitment leads to wasted resources, while high overcommitment leads to task performance degradation. In this paper, we take a first principles approach to designing and evaluating overcommit policies by asking a basic question: assuming complete knowledge of each task's future resource usage, what is the safest overcommit policy that yields the highest utilization? We call this policy the peak oracle. We then devise practical overcommit policies that mimic this peak oracle by predicting future machine resource usage. We simulate our overcommit policies using the recently-released Google cluster trace, and show that they result in higher utilization and fewer overcommit errors than policies based on per-task allocations. We also deploy these policies to machines inside Google's datacenters serving its internal production workload. We show that our overcommit policies increase these machines' usable CPU capacity by 10-16\% compared to no overcommitment.} }
@inproceedings{zuk2020schedulingfaas, author = {Pawel Zuk and Krzysztof Rzadca}, title = {Scheduling Methods to Reduce Response Latency of Function as a Service}, booktitle = {32nd {IEEE} International Symposium on Computer Architecture and High Performance Computing, {SBAC-PAD} 2020, Porto, Portugal, September 9-11, 2020}, pages = {132--140}, keywords = {resource management, scheduling, resource_management, datacenter, FaaS}, publisher = {{IEEE}}, year = {2020}, url = {https://arxiv.org/abs/2008.04830}, doi = {10.1109/SBAC-PAD49847.2020.00028}, abstract = {Function as a Service (FaaS) permits cloud customers to deploy individual functions to the cloud, in contrast to complete virtual machines or Linux containers. All major cloud providers offer FaaS products (Amazon Lambda, Google Cloud Functions, Azure Serverless); there are also popular open-source implementations (Apache OpenWhisk) with commercial offerings (Adobe I/O Runtime, IBM Cloud Functions). A new feature of FaaS is function composition: a function may (sequentially) call another function, which, in turn, may call yet another function---forming a chain of invocations. From the perspective of the infrastructure, a composed FaaS is less opaque than a virtual machine or a container. We show that this additional information enables the infrastructure to reduce the response latency. In particular, knowing the sequence of future invocations, the infrastructure can schedule these invocations along with environment preparation. We model resource management in FaaS as a scheduling problem combining (1) sequencing of invocations, (2) deploying execution environments on machines, and (3) allocating invocations to deployed environments. For each aspect, we propose heuristics. We explore their performance by simulation on a range of synthetic workloads. Our results show that if the setup times are long compared to invocation times, algorithms that use information about the composition of functions consistently outperform greedy, myopic algorithms, leading to a significant decrease in response latency.} }
@inproceedings{rzadca2020autopilot, author = {K. Rzadca and P. Findeisen and J. Swiderski and P. Zych and P. Broniek and J. Kusmierek and P. Nowak and B. Strack and S. Hand and J. Wilkes}, booktitle = {EuroSys'20: Proceedings of the Fifteenth EuroSys Conference}, title = {Autopilot: workload autoscaling at Google}, year = {2020}, publisher = {ACM}, keywords = {vertical autoscaling, machine learning, resource management, scheduling, resource_management, datacenter}, doi = {10.1145/3342195.3387524}, abstract = {In many public and private Cloud systems, users need to specify a limit for the amount of resources (CPU cores and RAM) to provision for their workloads. A job that exceeds its limits might be throttled or killed, resulting in delaying or dropping end-user requests, so human operators naturally err on the side of caution and request a larger limit than the job needs. At scale, this results in massive aggregate resource wastage. To address this, Google uses Autopilot to configure resources automatically, adjusting both the number of concurrent tasks in a job (horizontal scaling) and the CPU/memory limits for individual tasks (vertical scaling). Autopilot walks the same fine line as human operators: its primary goal is to reduce slack - the difference between the limit and the actual resource usage - while minimizing the risk that a task is killed with an out-of-memory (OOM) error or its performance degraded because of CPU throttling. Autopilot uses machine learning algorithms applied to historical data about prior executions of a job, plus a set of finely-tuned heuristics, to walk this line. In practice, Autopiloted jobs have a slack of just 23\%, compared with 46\% for manually-managed jobs. Additionally, Autopilot reduces the number of jobs severely impacted by OOMs by a factor of 10. Despite its advantages, ensuring that Autopilot was widely adopted took significant effort, including making potential recommendations easily visible to customers who had yet to opt in, automatically migrating certain categories of jobs, and adding support for custom recommenders. At the time of writing, Autopiloted jobs account for over 48\% of Google's fleet-wide resource usage.} }
@incollection{theron2020reference, title = {Reference Architecture of an Autonomous Agent for Cyber Defense of Complex Military Systems}, author = {Th{\'e}ron, Paul and Kott, Alexander and Dra{\v{s}}ar, Martin and Rzadca, Krzysztof and LeBlanc, Beno{\^\i}t and Pihelgas, Mauno and Mancini, Luigi and de Gaspari, Fabio}, booktitle = {Adaptive Autonomous Secure Cyber Systems}, pages = {1--21}, year = {2020}, doi = {10.1007/978-3-030-33432-1_1}, publisher = {Springer}, keywords = {intelligent_agent} }
@article{kott2020introductory, title = {An introductory preview of Autonomous Intelligent Cyber-defense Agent reference architecture, release 2.0}, author = {Kott, Alexander and Th{\'e}ron, Paul and Mancini, Luigi V and Dushku, Edlira and Panico, Agostino and Dra{\v{s}}ar, Martin and LeBlanc, Beno{\^\i}t and Losiewicz, Paul and Guarino, Alessandro and Pihelgas, Mauno and Rzadca, Krzysztof}, journal = {The Journal of Defense Modeling and Simulation}, volume = {17}, number = {1}, pages = {51--54}, year = {2020}, doi = {10.1177/1548512919886136}, publisher = {SAGE Publications}, keywords = {intelligent_agent} }
@article{pascual2019sideeff, author = {Pascual, Fanny and Rzadca, Krzysztof}, title = {Optimizing Egalitarian Performance when Colocating Tasks with Types for Cloud Data Center Resource Management}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {2019}, optkey = {}, volume = {30}, number = {11}, pages = {2523-2535}, month = {November}, keywords = {resource_management, datacenter}, publisher = {IEEE}, abstract = {In data centers, up to dozens of tasks are colocated on a single physical machine. Machines are used more efficiently, but the performance of the tasks deteriorates, as the colocated tasks compete for shared resources. Since the tasks are heterogeneous, the resulting performance dependencies are complex. In our previous work [1], [2] we proposed a new combinatorial optimization model that uses two parameters of a task - its size and its type - to characterize how a task influences the performance of other tasks allocated to the same machine. In this paper, we study the egalitarian optimization goal: the aim is to optimize the performance of the worst-off task. This problem generalizes the classic makespan minimization on multiple processors (P||Cmax). We prove that polynomially-solvable variants of P||Cmax are NP-hard for this generalization, and that the problem is hard to approximate when the number of types is not constant. For a constant number of types, we propose a PTAS, a fast approximation algorithm, and a series of heuristics. We simulate the algorithms on instances derived from a trace of one of Google clusters. Compared with baseline algorithms solving P||Cmax, our proposed algorithms, aware of the types of the jobs, lead to significantly better task performance. The notion of type enables us to extend standard combinatorial optimization methods to handle degradation of performance caused by colocation. Types add a layer of additional complexity. However, our results - approximation algorithms and good average-case performance - show that types can be handled efficiently.}, url = {https://hal.archives-ouvertes.fr/hal-02102674}, doi = {10.1109/TPDS.2019.2911084}, optnote = {}, optannote = {} }
@inproceedings{theron18aica, author = {P. Theron and A. Kott and M. Drasar and K. Rzadca and B. LeBlanc and M. Pihelgas and L. Mancini and A. Panico}, booktitle = {2018 International Conference on Military Communications and Information Systems (ICMCIS)}, title = {Towards an active, autonomous and intelligent cyber defense of military systems: The {NATO} {AICA} reference architecture}, year = {2018}, pages = {1-9}, publisher = {IEEE}, url = {https://arxiv.org/abs/1806.08657}, keywords = {intelligent_agent, autonomy, cyber warfare, cyber security}, doi = {10.1109/ICMCIS.2018.8398730} }
@inproceedings{pacholczyk18federatedclouds, author = {Pacholczyk, Milosz and Rzadca, Krzysztof}, title = {Fair non-monetary scheduling in federated clouds}, doi = {10.1145/3195870.3195873}, keywords = {resource_management, datacenter}, booktitle = {CrossCloud'18: 5th Workshop on CrossCloud Infrastructures \& Platforms, EuroSys'18 Workshops}, url = {https://arxiv.org/abs/1803.06178}, year = {2018}, pages = {3:1--3:6}, publisher = {ACM}, abstract = {In a hybrid cloud, individual cloud service providers (CSPs) often have an incentive to use each other's resources to off-load peak loads or place load closer to the end user. However, CSPs have to keep track of contributions and gains in order to disincentivize long-term free-riding. We show CloudShare, a distributed version of a load balancing algorithm DirectCloud based on the Shapley value---a powerful fairness concept from game theory. CloudShare coordinates CSPs by a ZooKeeper-based coordination layer; each CSP runs a broker that interacts with local resources (such as Kubernetes-managed clusters). We quantitatively evaluate our implementation by simulation. The results confirm that CloudShare generates on average fairer schedules than the popular FairShare algorithm. We believe our results show a viable alternative to monetary methods based on, e.g., spot markets.}, opteditor = {}, optvolume = {}, optnumber = {}, optseries = {}, optpages = {}, optmonth = {}, optaddress = {}, optorganization = {}, optpublisher = {}, optnote = {}, optannote = {} }
@inproceedings{pascual2018collectiveschedules, author = {Pascual, Fanny and Rzadca, Krzysztof and Skowron, Piotr}, title = {Collective Schedules: Scheduling Meets Computational Social Choice}, optcrossref = {}, optkey = {}, keywords = {scheduling, resource_management}, booktitle = {AAMAS '18, 17th International Conference on Autonomous Agents and Multiagent Systems}, year = {2018}, pages = {667-675}, url = {http://dl.acm.org/citation.cfm?id=3237383.3237482}, acmid = {3237482}, publisher = {IFAAMAS}, abstract = {When scheduling public works or events in a shared facility one needs to accommodate preferences of a population. We formalize this problem by introducing the notion of a collective schedule. We show how to extend fundamental tools from social choice theory---positional scoring rules, the Kemeny rule and the Condorcet principle---to collective scheduling. We study the computational complexity of finding collective schedules. We also experimentally demonstrate that optimal collective schedules can be found for instances with realistic sizes.}, opteditor = {}, optvolume = {}, optnumber = {}, optseries = {}, optpages = {}, optmonth = {}, optaddress = {}, optorganization = {}, optpublisher = {}, optnote = {}, optannote = {} }
@article{pascual2018colocation, author = {Pascual, Fanny and Rzadca, Krzysztof}, title = {Colocating tasks in data~centers using a side-effects performance model}, journal = {European Journal of Operational Research}, year = 2018, keywords = {scheduling, resource_management, datacenter}, doi = {10.1016/j.ejor.2018.01.046}, volume = 268, number = 2, pages = {450-462}, abstract = {In data~centers, many tasks (services, virtual machines or computational jobs) share a single physical machine. We explore a new resource management model for such colocation. Our model uses two parameters of a task---its size and its type---to characterize how a task influences the performance of the other tasks allocated on the same machine. As typically a data~center hosts many similar, recurring tasks (e.g. a webserver, a database, a CPU-intensive computation), the resource manager should be able to construct these types and their performance interactions. In particular, we minimize the total cost in a model in which each task's cost is a function of the total sizes of tasks allocated on the same machine (each type is counted separately). We show that for a linear cost function the problem is strongly NP-hard, but polynomially-solvable in some particular cases. We propose an algorithm polynomial in the number of tasks (but exponential in the number of types and machines) and another algorithm polynomial in the number of tasks and machines (but exponential in the number of types and admissible sizes of tasks). We also propose a polynomial time approximation algorithm, and, in the case of a single type, a polynomial time exact algorithm. For convex costs, we prove that, even for a single type, the problem becomes NP-hard, and we propose an approximation algorithm. We experimentally verify our algorithms on instances derived from a real-world data~center trace. While the exact algorithms are infeasible for large instances, the approximations and heuristics deliver reasonable performance. } }
@inproceedings{janus2017slo-colocation, author = {Janus, Pawel and Rzadca, Krzysztof}, title = {{SLO}-aware Colocation of Data Center Tasks Based on Instantaneous Processor Requirements}, keywords = {scheduling, resource_management, datacenter}, booktitle = {ACM SoCC'17, ACM Symposium on Cloud Computing}, year = {2017}, pages = {256--268}, publisher = {ACM}, abstract = {In a cloud data center, a single physical machine simultaneously executes dozens of highly heterogeneous tasks. Such colocation results in more efficient utilization of machines, but, when tasks' requirements exceed available resources, some of the tasks might be throttled down or preempted. We analyze version 2.1 of the Google cluster trace that shows short-term (1 second) task CPU usage. Contrary to the assumptions taken by many theoretical studies, we demonstrate that the empirical distributions do not follow any single distribution. However, high percentiles of the total processor usage (summed over at least 10 tasks) can be reasonably estimated by the Gaussian distribution. We use this result for a probabilistic fit test, called the Gaussian Percentile Approximation (GPA), for standard bin-packing algorithms. To check whether a new task will fit into a machine, GPA checks whether the resulting distribution's percentile corresponding to the requested service level objective (SLO) is still below the machine's capacity. In our simulation experiments, GPA resulted in colocations exceeding the machines' capacity with a frequency similar to the requested SLO.}, pdf = {papers/slocolocation-socc2017.pdf}, doi = {10.1145/3127479.3132244}, url = {http://arxiv.org/abs/1709.01384} }
@inproceedings{milka2017dfuntest, author = {Milka, Grzegorz and Rzadca, Krzysztof}, title = {Dfuntest: A Testing Framework for Distributed Applications}, keywords = {p2p}, booktitle = {PPAM 2017, International Conference on Parallel Processing and Applied Mathematics}, year = {2017}, series = {LNCS}, volume = {10777}, doi = {10.1007/978-3-319-78024-5_35}, url = {http://arxiv.org/abs/1803.04442}, publisher = {Springer} }
@inproceedings{pascual2017maxcost, author = {Pascual, Fanny and Rzadca, Krzysztof}, title = {Optimizing egalitarian performance in the side-effects model of colocation for data center resource management}, keywords = {scheduling, resource_management, datacenter}, booktitle = {Euro-Par 2017 Proceedings}, year = {2017}, pages = {206-219}, doi = {10.1007/978-3-319-64203-1_15}, url = {https://arxiv.org/abs/1610.07339}, pdf = {papers/maxcost_sideff-europar2017.pdf}, volume = {10417}, series = {LNCS}, publisher = {Springer}, abstract = {In data centers, up to dozens of tasks are colocated on a single physical machine. Machines are used more efficiently, but tasks' performance deteriorates, as colocated tasks compete for shared resources. As tasks are heterogeneous, the resulting performance dependencies are complex. In our previous work [18] we proposed a new combinatorial optimization model that uses two parameters of a task - its size and its type - to characterize how a task influences the performance of other tasks allocated to the same machine. In this paper, we study the egalitarian optimization goal: maximizing the worst-off performance. This problem generalizes the classic makespan minimization on multiple processors (P||Cmax). We prove that polynomially-solvable variants of multiprocessor scheduling are NP-hard and hard to approximate when the number of types is not constant. For a constant number of types, we propose a PTAS, a fast approximation algorithm, and a series of heuristics. We simulate the algorithms on instances derived from a trace of one of Google clusters. Algorithms aware of jobs' types lead to better performance compared with algorithms solving P||Cmax. The notion of type enables us to model degeneration of performance caused by using standard combinatorial optimization methods. Types add a layer of additional complexity. However, our results - approximation algorithms and good average-case performance - show that types can be handled efficiently.}, opteditor = {}, optvolume = {}, optnumber = {}, optseries = {}, optmonth = {}, optaddress = {}, optorganization = {}, optpublisher = {}, optnote = {}, optannote = {} }
@article{skowron16complexprojects, author = {Skowron, Piotr and Rzadca, Krzysztof and Datta, Anwitaman}, title = {Cooperation and Competition when Bidding for Complex Projects: Centralized and Decentralized Perspectives}, journal = {IEEE Intelligent Systems}, year = {2017}, keywords = {resource_management,scheduling}, doi = {10.1109/MIS.2017.4}, url = {https://arxiv.org/abs/1402.2970}, abstract = {To successfully complete a complex project, be it a construction of an airport or of a backbone IT system, agents (companies or individuals) must form a team having required competences and resources. A team can be formed either by the project issuer based on individual agents' offers (centralized formation); or by the agents themselves (decentralized formation) bidding for a project as a consortium---in that case many feasible teams compete for the contract. We investigate rational strategies of the agents (what salary should they ask? with whom should they team up?). We propose concepts to characterize the stability of the winning teams and study their computational complexity.}, optkey = {}, volume = {32}, number = {1}, pages = {17-23}, optmonth = {}, optnote = {}, optannote = {} }
@article{skowron2015p2preplica, author = {Skowron, Piotr and Rzadca, Krzysztof}, title = {Flexible replica placement for optimized P2P backup on heterogeneous, unreliable machines}, journal = {Concurrency and Computation: Practice and Experience}, issn = {1532-0634}, url = {http://dx.doi.org/10.1002/cpe.3491}, doi = {10.1002/cpe.3491}, annote = {HPCS 2013 Selected Papers}, pdf = {papers/bapap_ccpe2015.pdf}, keywords = {p2p,distributed storage, enterprise backup, data replication, unstructured P2P networks, availability}, year = {2016}, volume = {28}, number = {7}, pages = {2166-2186}, abstract = {P2P architecture is a viable option for enterprise backup. In contrast to dedicated backup servers, nowadays a standard solution, making backups directly on organization's workstations should be cheaper as existing hardware is used; more efficient as there is no single bottleneck server; and more reliable as the machines can be geographically dispersed. We present an architecture of a p2p backup system that uses pairwise replication contracts between a data owner and a replicator. In contrast to a standard p2p storage system using directly a DHT, the contracts allow our system to optimize replicas' placement depending on a specific optimization strategy, and so to take advantage of the heterogeneity of the machines and the network. Such optimization is particularly appealing in the context of backup: replicas can be geographically dispersed, the load sent over the network can be minimized, or the optimization goal can be to minimize the backup/restore time. However, managing the contracts, keeping them consistent and adjusting them in response to dynamically changing environment is challenging. We built a scientific prototype and ran experiments on 150 workstations in our university's computer laboratories and, separately, on 50 PlanetLab nodes. We found out that the main factor affecting the performance of the system is the availability of the machines. Yet, our main conclusion is that it is possible to build an efficient and reliable backup system on highly unavailable machines, as our computers had just 13\% average availability.} }
@inproceedings{skowron2015geolb, author = {Skowron, Piotr and Rzadca, Krzysztof}, title = {Geographically Distributed Load Balancing with (Almost) Arbitrary Load Functions}, keywords = {scheduling, resource_management}, booktitle = {HiPC 2015, 22nd IEEE/ACM International Conference on High Performance Computing}, year = {2015}, abstract = {In geographically-distributed systems, communication latencies are non-negligible. The perceived processing time of a request is thus composed of the time needed to route the request to the server and the true processing time. Once a request reaches a target server, the processing time depends on the total load of that server; this dependency is described by a load function. We consider a broad class of load functions; we just require that they are convex and twice differentiable. In particular, our model can be applied to heterogeneous systems in which every server has a different load function. We present centralized and decentralized optimization algorithms for load balancing. We prove bounds on the algorithms' convergence. To the best of our knowledge, these bounds were not known even for the special cases studied previously (queuing theory and batches of requests). Both algorithms are any-time and self-stabilizing.}, pdf = {papers/geo_lb-hipc2015.pdf}, doi = {10.1109/HiPC.2015.16}, opteditor = {}, optvolume = {}, optnumber = {}, optseries = {}, optpages = {}, optmonth = {}, optaddress = {}, optorganization = {}, publisher = {IEEE}, optnote = {}, optannote = {} }
@inproceedings{pascual2015partition, author = {Pascual, Fanny and Rzadca, Krzysztof}, title = {Partition with Side Effects}, keywords = {scheduling, resource_management, datacenter}, booktitle = {HiPC 2015, 22nd IEEE/ACM International Conference on High Performance Computing}, year = 2015, abstract = {In data centers, many tasks (services, virtual machines or computational jobs) share a single physical machine. We propose a new resource management model for such colocation. Our model uses two parameters of a task---its size and its type---to characterize how a task influences the performance of the other tasks allocated on the same machine. As typically a data~center hosts many similar, recurring tasks (e.g. a webserver, a database, a CPU-intensive computation), the resource manager should be able to construct these types and their performance interactions. Moreover, realistic variants of our model are polynomially-solvable, in contrast to the NP-hard vector packing used previously. In particular, we minimize the total cost in a model in which each task's cost is a function of the total sizes of tasks allocated on the same machine (each type is counted separately). We show that for a linear cost function the problem is strongly NP-hard, but polynomially-solvable in some particular cases. We propose an algorithm polynomial in the number of tasks (but exponential in the number of types and machines); and another algorithm polynomial in the number of tasks and machines (but exponential in the number of types and admissible sizes of tasks). When there is a single type, we give a polynomial time algorithm. We also prove that, even for a single type, the problem becomes NP-hard for convex costs.}, pdf = {papers/partition_with_side_effets-hipc2015.pdf}, doi = {10.1109/HiPC.2015.52}, publisher = {IEEE} }
@article{rzadca2015mechanisms, author = {Rzadca, Krzysztof and Datta, Anwitaman and Kreitz, Gunnar and Buchegger, Sonja}, title = {Game-Theoretic Mechanisms to Increase Data Availability in Decentralized Storage Systems}, journal = {ACM Transactions on Autonomous and Adaptive Systems}, year = {2015}, optkey = {}, doi = {10.1145/2723771}, volume = {10}, number = {3}, pages = {14:1-14:32}, optmonth = {}, keywords = {p2p, p2p_storage, game theory, price of anarchy}, pdf = {papers/mech-taas2015.pdf}, abstract = {In a decentralized storage system, agents replicate each other's data in order to increase availability. Compared to organizationally-centralized solutions, such as cloud storage, a decentralized storage system requires less trust in the provider and may result in smaller monetary costs. Our system is based on reciprocal storage contracts which allow the agents to adapt to changes in their replication partners' availability (by dropping inefficient contracts and forming new contracts with other partners). The data availability provided by the system is a function of the participating agents' availability. However, a straightforward system in which agents' matching is decentralized uses the given agent availability inefficiently. As agents are autonomous, the highly-available agents form cliques replicating data between each other, which makes the system too hostile for the weakly-available newcomers. In contrast, a centralized, equitable matching is not incentive-compatible: it does not reward users for keeping their software running. We solve this dilemma by a mixed solution: an ``adoption'' mechanism in which highly-available agents donate some replication space, which in turn is used to help the worst-off agents. We show that the adoption motivates agents to increase their availability (is incentive-compatible); but also that it is sufficient for acceptable data availability for weakly-available agents.}, optannote = {} }
@inproceedings{georgiou2015efs, author = {Georgiou, Yiannis and Glesser, David and Rzadca, Krzysztof and Trystram, Denis}, title = {A Scheduler-Level Incentive Mechanism for Energy Efficiency in {HPC}}, keywords = {scheduling, resource_management}, booktitle = {CCGrid 2015, 15th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing}, year = {2015}, pages = {617-626}, doi = {10.1109/CCGrid.2015.101}, pdf = {papers/energyfairshare_ccgrid2015.pdf}, abstract = {Energy consumption has become one of the most important factors in High Performance Computing platforms. However, while there are various algorithmic and programming techniques to save energy, a user currently has no incentive to employ them, as they might result in worse performance. We propose to manage the energy budget of a supercomputer through EnergyFairShare (EFS), a FairShare-like scheduling algorithm. FairShare is a classic scheduling rule that prioritizes jobs belonging to users who were assigned a small amount of CPU-seconds in the past. Similarly, EFS keeps track of users' consumption of Watt-seconds and prioritizes those whose jobs consumed less energy. Therefore, EFS incentivizes users to optimize their code for energy efficiency. Having higher priority, jobs have smaller queuing times and, thus, smaller turn-around time. To validate this principle, we implemented EFS in a scheduling simulator and processed workloads from various HPC centers. The results show that, by reducing her energy consumption, a user will reduce her stretch (slowdown), compared to increasing her energy consumption. To validate the general feasibility of our approach, we also implemented EFS as an extension for SLURM, a popular HPC resource and job management system. We validated our plugin both by emulating a large scale platform, and by experiments upon a real cluster with monitored energy consumption. We observed smaller waiting times for energy efficient users.} }
@inproceedings{skowron2014peopleprocessors, author = {Skowron, Piotr and Rzadca, Krzysztof and Datta, Anwitaman}, title = {People are Processors: Coalitional Auctions for Complex Projects (Extended Abstract)}, keywords = {scheduling, resource_management}, booktitle = {AAMAS 2014, 13th International Conference on Autonomous Agents and Multiagent Systems}, year = {2014}, pdf = {papers/peopleprocessors_aamas2014.pdf}, abstract = {To successfully complete a complex project, be it a construction of an airport, a backbone IT system, or a crowd-sourced project, agents (companies or individuals) must form a team (a coalition) having required competences and resources. A team can be formed either by the project issuer based on individual agents' offers (centralized formation); or by the agents themselves (decentralized formation) bidding for a project as a consortium---in that case many feasible teams compete for the employment contract. In these models, we investigate rational strategies of the agents (what salary should they ask? with whom should they team up?) under different organizations of the market. We propose various concepts that allow us to characterize the stability of the winning teams. We show that there may be no (rigorously) strongly winning coalition, but the weakly winning and the auction-winning coalitions are guaranteed to exist. In a general setting, with an oracle that decides whether a coalition is feasible, we show how to find winning coalitions with a polynomial number of calls to the oracle. We also determine the complexity of the problem in a special case in which a project is a set of independent tasks. Each task must be processed by a single agent, but processing speeds differ between agents and tasks.}, url = {http://arxiv.org/abs/1402.2970v1}, pages = {1525-1526} }
@inproceedings{emeras2013ostrich, author = {Emeras, Joseph and Pinheiro, Vinicius and Rzadca, Krzysztof and Trystram, Denis}, title = {OStrich: Fair Scheduling for Multiple Submissions}, optcrossref = {}, keywords = {scheduling, resource_management}, booktitle = {PPAM 2013, International Conference on Parallel Processing and Applied Mathematics}, pages = {26-37}, year = {2014}, pdf = {papers/ostrich-ppam2013.pdf}, abstract = {Campaign Scheduling is characterized by multiple job submissions issued from multiple users over time. This model perfectly suits today's systems since most available parallel environments have multiple users sharing a common infrastructure. When scheduling individually the jobs submitted by various users, one crucial issue is to ensure fairness. This work presents a new fair scheduling algorithm called OStrich whose principle is to maintain a virtual time-sharing schedule in which the same number of processors is assigned to each user. The completion times in the virtual schedule determine the execution order on the physical processors. Then, the campaigns are interleaved in a fair way by OStrich. For independent sequential jobs, we show that OStrich guarantees the stretch of a campaign to be proportional to the campaign's size and the total number of users. The stretch is used for measuring by what factor a workload is slowed down relative to the time it takes on an unloaded system. The theoretical performance of our solution is assessed by simulation, comparing OStrich to the classical FCFS algorithm on synthetic workload traces generated from two different user profiles. This demonstrates how OStrich benefits both types of users, in contrast to FCFS.}, volume = {8385}, optnumber = {}, series = {LNCS}, optaddress = {}, optmonth = {}, optorganization = {}, publisher = {Springer}, optnote = {}, optannote = {} }
@inproceedings{skowron2013measuringfairness, author = {Skowron, Piotr and Rzadca, Krzysztof}, title = {Fair share is not enough: measuring fairness in scheduling with cooperative game theory}, keywords = {scheduling, resource_management}, booktitle = {PPAM 2013, International Conference on Parallel Processing and Applied Mathematics}, pdf = {papers/fairnessOfAlgorithms-ppam2013.pdf}, pages = {38-48}, year = {2014}, opteditor = {}, volume = {8385}, optnumber = {}, series = {LNCS}, optaddress = {}, optmonth = {}, optorganization = {}, publisher = {Springer}, optnote = {}, optannote = {}, abstract = {We consider the problem of fair scheduling in a multi-organizational system in which organizations are contributing their own resources to the global pool and the jobs to be processed on the common resources. We consider on-line, non-clairvoyant scheduling of sequential jobs without preemption. The organizations must agree on the order of executing the jobs, i.e. on the scheduling algorithm. To ensure that the organizations are willing to cooperate the scheduling algorithm must be fair. To characterize fairness, we use a cooperative game theory approach. The contribution of an organization is computed based on how this organization influences the utility (which can be any metric, e.g., flow time, turnaround, resource allocation, etc.) of all organizations. Formally, the contribution of the organization is its Shapley value in the cooperative game. The scheduling algorithm should ensure that the contributions of the organizations are close to their utilities. Our previous work proves that this problem is NP-hard and hard to approximate. In this paper we propose a heuristic scheduling algorithm for the fair scheduling problem. We experimentally evaluate the heuristic and compare its fairness to other algorithms (fair share, round robin) and the exact exponential algorithm. Our results show that fairness of the heuristic algorithm is close to the optimal. The difference between our heuristic and the fair share algorithm is more visible on longer traces with more organizations. We believe that these results prove that fair share might not be an optimal solution for multi-organizational systems.} }
@inproceedings{Skowron2013p2pbackup, author = {Skowron, Piotr and Rzadca, Krzysztof}, title = {Exploring heterogeneity of unreliable machines for p2p backup}, keywords = {p2p}, booktitle = {HPCS 2013, International Conference on High Performance Computing \& Simulation}, pdf = {papers/p2pbackup_hpcs2013.pdf}, abstract = {P2P architecture is a viable option for enterprise backup. In contrast to dedicated backup servers, nowadays a standard solution, making backups directly on organization's workstations should be cheaper (as existing hardware is used), more efficient (as there is no single bottleneck server) and more reliable (as the machines are geographically dispersed). We present the architecture of a p2p backup system that uses pairwise replication contracts between a data owner and a replicator. In contrast to standard p2p storage systems using directly a DHT, the contracts allow our system to optimize replicas' placement depending on a specific optimization strategy, and so to take advantage of the heterogeneity of the machines and the network. Possible goals include geographical dispersion of the replicas, minimization of backup or restore time or the usage of long-range links. However, managing the contracts, keeping them consistent and adjusting them in response to dynamically changing environment is challenging. We built a scientific prototype and ran the experiments on 150 workstations in the university's computer laboratories and, separately, on 50 PlanetLab nodes. We found out that the main factor affecting the quality of the system is the availability of the machines. Yet, our main conclusion is that it is possible to build an efficient and reliable backup system on highly unreliable machines (our computers had just 13\% average availability).}, pages = {91-98}, year = {2013}, doi = {10.1109/HPCSim.2013.6641398}, url = {http://arxiv.org/abs/1212.0427}, opteditor = {}, optvolume = {}, optnumber = {}, optseries = {}, optaddress = {}, optmonth = {}, optorganization = {}, publisher = {IEEE}, optnote = {}, optannote = {} }
@inproceedings{Skowron2013shapleyscheduling, author = {Skowron, Piotr and Rzadca, Krzysztof}, title = {Non-monetary fair scheduling --- a cooperative game theory approach}, keywords = {scheduling,resource_management,fairness}, booktitle = {SPAA 2013, 25th ACM Symposium on Parallelism in Algorithms and Architectures}, url = {http://arxiv.org/abs/1302.0948}, doi = {10.1145/2486159.2486169}, pdf = {papers/shapleyScheduling-spaa2013.pdf}, abstract = {We consider a multi-organizational system in which each organization contributes processors to the global pool but also jobs to be processed on the common resources. The fairness of the scheduling algorithm is essential for the stability and even for the existence of such systems (as organizations may refuse to join an unfair system). We consider on-line, non-clairvoyant scheduling of sequential jobs. The started jobs cannot be stopped, canceled, preempted, or moved to other processors. We consider identical processors, but most of our results can be extended to related or unrelated processors. We model the fair scheduling problem as a cooperative game and we use the Shapley value to determine the ideal fair schedule. In contrast to the current literature, we do not use money to assess the relative utilities of jobs. Instead, to calculate the contribution of an organization, we determine how the presence of this organization influences the performance of other organizations. Our approach can be used with an arbitrary utility function (e.g., flow time, tardiness, resource utilization), but we argue that the utility function should be strategy resilient. The organizations should be discouraged from splitting, merging or delaying their jobs. We present the unique (up to multiplicative and additive constants) strategy-resilient utility function. We show that the problem of fair scheduling is NP-hard and hard to approximate. However, for unit-size jobs, we present a fully polynomial-time randomized approximation scheme (FPRAS). We also show that the problem parametrized with the number of organizations is fixed parameter tractable (FPT). In cooperative game theory, the Shapley value is considered in many contexts as ``the'' fair solution. Our results show that, although the problem for the large number of organizations is computationally hard, this solution concept can be used in scheduling (for instance, as a benchmark for measuring fairness of heuristic algorithms).}, pages = {288--297}, year = {2013}, opteditor = {}, optvolume = {}, optnumber = {}, optseries = {}, optaddress = {New York, NY, USA}, optmonth = {}, optorganization = {}, publisher = {ACM}, optnote = {}, optannote = {} }
@inproceedings{Skowron2013networkdelaylb, author = {Skowron, Piotr and Rzadca, Krzysztof}, title = {Network delay-aware load balancing in selfish and cooperative distributed systems}, keywords = {resource_management}, booktitle = {HCW 2013, 22nd International Heterogeneity in Computing Workshop (in conjunction with IPDPS 2013)}, url = {http://arxiv.org/abs/1212.0421}, pdf = {papers/contentDelivery_hcw2013.pdf}, abstract = {We consider a geographically distributed request processing system composed of various organizations and their servers connected by the Internet. The latency a user observes is a sum of communication delays and the time needed to handle the request on a server. The handling time depends on the server congestion, i.e. the total number of requests a server must handle. We analyze the problem of balancing the load in a network of servers in order to minimize the total observed latency. We consider both cooperative and selfish organizations (each organization aiming to minimize the latency of the locally-produced requests). The problem can be generalized to task scheduling in a distributed cloud; or to content delivery in organizationally-distributed CDNs. In a cooperative network, we show that the problem is polynomially solvable. We also present a distributed algorithm iteratively balancing the load. We show how to estimate the distance between the current solution and the optimum based on the amount of load exchanged by the algorithm. During the experimental evaluation, we show that the distributed algorithm is efficient, therefore it can be used in networks with dynamically changing loads. In a network of selfish organizations, we prove that the price of anarchy (the worst-case loss of performance due to selfishness) is low when the network is homogeneous and the servers are loaded (the request handling time is high compared to the communication delay). After relaxing these assumptions, we assess the loss of performance caused by the selfishness experimentally, showing that it remains low. Our results indicate that a set of servers handling requests, connected in a heterogeneous network, can be efficiently managed by a distributed algorithm. Additionally, even if the network is organizationally distributed, with individual organizations optimizing performance of their requests, the network remains efficient.}, pages = {7-18}, year = {2013}, opteditor = {}, optvolume = {}, optnumber = {}, series = {IPDPSW}, optaddress = {}, optmonth = {}, optorganization = {}, publisher = {IEEE}, optnote = {}, optannote = {} }
@article{LiuXin2012stereotypes, author = {Liu, Xin and Datta, Anwitaman and Rzadca, Krzysztof}, title = {Trust beyond reputation: A computational trust model based on stereotypes}, journal = {Electronic Commerce Research and Applications}, year = {2013}, keywords = {trust}, doi = {10.1016/j.elerap.2012.07.001}, url = {http://arxiv.org/abs/1103.2215}, abstract = {Models of computational trust support users in taking decisions. They are commonly used to guide users' judgements in online auction sites; or to determine quality of contributions in Web 2.0 sites. However, most existing systems require historical information about the past behavior of the specific agent being judged. In contrast, in real life, to anticipate and to predict a stranger's actions in absence of the knowledge of such behavioral history, we often use our "instinct"---essentially stereotypes developed from our past interactions with other "similar" persons. In this paper, we propose StereoTrust, a computational trust model inspired by stereotypes as used in real life. A stereotype contains certain features of agents and an expected outcome of the transaction. When facing a stranger, an agent derives its trust by aggregating stereotypes matching the stranger's profile. Since stereotypes are formed locally, recommendations stem from the trustor's own personal experiences and perspective. Historical behavioral information, when available, can be used to refine the analysis. According to our experiments using the Epinions.com dataset, StereoTrust compares favorably with existing trust models that use different kinds of information and more complete historical information.}, optkey = {}, volume = {12}, number = {1}, pages = {24-39}, month = {January-February}, optnote = {}, optannote = {} }
@inproceedings{Pinheiro2012campaign, author = {Vinicius Pinheiro and Krzysztof Rzadca and Denis Trystram}, title = {Campaign Scheduling}, booktitle = {IEEE International Conference on High Performance Computing (HiPC), Proceedings}, keywords = {scheduling, fairness}, year = 2012, pages = {1-10}, pdf = {papers/campaign_scheduling_hipc2012.pdf}, abstract = {We study the problem of scheduling in parallel systems with many users. We analyze scenarios with many submissions issued over time by several users. These submissions contain one or more jobs; the set of submissions are organized in successive campaigns. Jobs belonging to a single campaign are sequential and independent, but any job from a campaign cannot start until all the jobs from the previous campaign are completed. Each user's goal is to minimize the sum of flow times of his campaigns. We define a theoretical model for Campaign Scheduling and show that, in the general case, it is NP-hard. For the single-user case, we show that an rho-approximation scheduling algorithm for the (classic) parallel job scheduling problem is also an rho-approximation for the Campaign Scheduling problem. For the general case with k users, we establish a fairness criterion inspired by time sharing. We propose FAIRCAMP, a scheduling algorithm which uses campaign deadlines to achieve fairness among users between consecutive campaigns. We prove that FAIRCAMP increases the flow time of each user by a factor of at most k rho compared with a machine dedicated to the user. We also prove that FAIRCAMP is a rho-approximation algorithm for the maximum stretch. By simulation, we compare FAIRCAMP to the First-Come-First-Served (FCFS). We show that, compared with FCFS, FAIRCAMP reduces the maximum stretch by up to 3.4 times. The difference is significant in systems used by many (k>5) users. Our results show that, rather than just individual, independent jobs, campaigns of jobs can be handled by the scheduler efficiently and fairly.} }
@incollection{Datta2011DOSN, author = {Datta, Anwitaman and Buchegger, Sonja and Vu, Le-Hung and Strufe, Thorsten and Rzadca, Krzysztof}, title = {Decentralized Online Social Networks}, booktitle = {Handbook of Social Network Technologies and Applications}, publisher = {Springer}, year = 2011, editor = {Furht, B.}, pages = {349-378}, optaddress = {New York Dordrecht Heidelberg London}, keywords = {p2p}, owner = {krz}, timestamp = {2010.08.29}, url = {http://www.springer.com/computer/swe/book/978-1-4419-7141-8} }
@incollection{Datta2010ServerlessSocialSoftware, author = {Datta, Anwitaman and Rzadca, Krzysztof and Ang, Sally and Goh, Chee Hong}, title = {Serverless Social Software for Nomadic Collaboration}, booktitle = {e-Research Collaboration: Theory, Techniques and Challenges}, publisher = {Springer}, year = {2010}, editor = {Murugan, A.}, pages = {85-103}, address = {New York Dordrecht Heidelberg London}, abstract = {Recent portable devices, from sophisticated mobile phones to netbooks, thanks to wireless networking and powerful batteries, give hardware support for collaborative work on the go, even when the Internet connection is not available. Yet, current collaboration software, from Concurrent Versions System (CVS) or Subversion (SVN) repositories to Google Docs and collaborative versions of commercial applications (CoWord, CoMaya), requires a dedicated server to synchronize clients, and thus a stable network connection. In this chapter, we present two tools that use the peer-to-peer paradigm to build serverless collaboration networks. PBDMS enables users to share, search and review bibliographic databases. SharedMind provides collaborative document editing to FreeMind, popular, open source mind-mapping software. Both tools handle disconnections and network divisions, enabling users to continue their work and to synchronize with their reachable peers. Both tools have been implemented and tested on a small scale. PBDMS is available for download at http://code.google.com/p/bibliographicsocialinfosys/; SharedMind is available at http://code.google.com/p/sharedmind . We believe that such seamless, flexible collaboration applications provide the degree of freedom promised by the recent portable devices, yet not fully used by the current applications.}, keywords = {p2p, collaborative applications}, owner = {krz}, timestamp = {2010.08.29}, url = {http://www.springer.com/economics/book/978-3-642-12256-9} }
@article{Trystram2010ApproximationAlgorithms, author = {Dutot, Pierre-Francois and Pascual, Fanny and Rzadca, Krzysztof and Trystram, Denis}, title = {Approximation Algorithms for the Multi-Organization Scheduling Problem}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {2011}, volume = {22}, number = {11}, pages = {1888-1895}, doi = {10.1109/TPDS.2011.47}, pdf = {papers/mocca-tpds.pdf}, keywords = {scheduling, resource_management}, owner = {krz}, abstract = {The distributed nature of new computing platforms results in the problem of scheduling parallel jobs produced by several independent organizations that have each their own rules. They have no direct control over the whole system, thus it is necessary to revisit classical scheduling with locality constraints. In this article, we consider distributed computing systems in which each organization has its own resources. Each organization aims at minimizing the execution times of its own jobs. We introduce a global centralized mechanism for designing a collaborative solution that improves the global performance of the system while respecting organizations' selfish objectives. The proposed algorithm is proved to have an approximation ratio equal to 3 over the global optimal makespan and this bound is shown to be asymptotically tight (when the number of organizations is large). Several variants of this problem are also studied. Then, we derive another algorithm that improves in practice these solutions by further balancing the schedules. Finally, we provide some experiments based on simulations that demonstrate a very good efficiency of this last algorithm on typical instances.}, timestamp = {2010.08.29} }
@inproceedings{ang2010sharedmind, title = {SharedMind: A tool for collaborative mind-mapping}, author = {Ang, S. and Rzadca, K. and Datta, A.}, booktitle = {2010 IEEE International Conference on Multimedia and Expo (ICME)}, pages = {1154--1155}, year = {2010}, publisher = {IEEE} }
@incollection{Dutot2009MultiObjectiveScheduling, author = {P.-F. Dutot and K. Rzadca and E. Saule and D. Trystram}, title = {Multi-Objective Scheduling}, booktitle = {Introduction to Scheduling}, publisher = {Chapman \& Hall/CRC}, year = {2009}, editor = {Y. Robert and F. Vivien}, series = {Computational Science}, keywords = {resource_management, scheduling}, owner = {krz}, pages = {219-251}, timestamp = {2009.08.18}, url = {http://www.crcpress.com/product/isbn/9781420072730} }
@incollection{Hupa2010interdisciplinary, author = {Hupa, A. and Rzadca, K. and Wierzbicki, A. and Datta, A.}, title = {Interdisciplinary matchmaking: Choosing collaborators by skill, acquaintance and trust}, booktitle = {Computational Social Network Analysis}, publisher = {Springer}, year = {2010}, editor = {Ajith Abraham and Aboul-Ella Hassanien and Vaclav Snasel}, series = {Computer Communication and Networks}, pages = {319--347}, doi = {10.1007/978-1-84882-229-0_12}, keywords = {trust}, url = {http://www.springerlink.com/content/m75j67354387gj06/} }
@inproceedings{Kaszuba2008DiscoveringMostTrusted, author = {T. Kaszuba and K. Rzadca and A. Wierzbicki}, title = {Discovering the Most Trusted Agents without Central Control}, booktitle = {IEEE/IFIP International Conference on Embedded and Ubiquitous Computing, Proceedings}, year = {2008}, volume = {2}, pages = {616-621}, publisher = {IEEE Computer Society}, doi = {10.1109/EUC.2008.126}, keywords = {trust} }
@inproceedings{Krystek2008ComparisonofCentralized, author = {M. Krystek and K. Kurowski and A. Oleksiak and K. Rzadca}, title = {Comparison of Centralized and Decentralized Scheduling Algorithms using GSSIM Simulation Environment}, booktitle = {Grid Computing: Achievements and Prospects}, year = {2008}, editor = {Sergei Gorlatch and Paraskevi Fragopoulou and Thierry Priol}, pages = {185--196}, publisher = {Springer}, doi = {10.1007/978-0-387-09457-1_16}, pdf = {papers/comp_centr_decentr_gssim.pdf}, keywords = {resource_management, scheduling} }
@article{Pascual2009CooperationinMultiOrganization, author = {F. Pascual and K. Rzadca and D. Trystram}, title = {Cooperation in Multi-Organization Scheduling}, journal = {Concurrency \& Computation: Practice and Experience}, year = {2009}, volume = {21}, number = {7}, pages = {905-921}, publisher = {Wiley InterScience}, abstract = {The distributed nature of the grid results in the problem of scheduling parallel jobs produced by several independent organizations that have partial control over the system. We consider systems in which each organization owns a cluster of processors. Each organization wants its tasks to be completed as soon as possible. In this paper, we model an off-line system consisting of N identical clusters of m processors. We show that it is always possible to produce a collaborative solution that respects participants' selfish goals while improving the global performance of the system. We propose an algorithm (called MOLBA) with a guaranteed worst-case performance ratio of 4 on the global makespan. Next, we show that a better bound (equal to 3) can be obtained in the specific case when the last completed job requires at most m-2 processors. Then, we derive another algorithm (called ILBA) that in practice improves the proposed, guaranteed solution by further balancing the schedules. Finally, by an extensive evaluation by simulation, we show that the algorithms are efficient on typical instances.}, doi = {10.1002/cpe.v21:7}, pdf = {papers/cooperation_multiorg_scheduling-conccomp09.pdf}, keywords = {resource_management, scheduling, approximation algorithms, fairness} }
@inproceedings{Pascual2007CooperationinMultiOrganization, author = {F. Pascual and K. Rzadca and D. Trystram}, title = {Cooperation in Multi-Organization Scheduling}, booktitle = {Euro-Par 2007 Proceedings}, year = {2007}, volume = {4641}, series = {LNCS}, pages = {224--233}, publisher = {Springer}, note = {Conference version of \cite{Pascual2009CooperationinMultiOrganization}}, doi = {10.1007/978-3-540-74466-5_25}, keywords = {resource_management, scheduling} }
@inproceedings{Roeblitz2006PlacementofReservations, author = {T. Roeblitz and K. Rzadca}, title = {On the Placement of Reservations into Job Schedules}, booktitle = {Euro-Par 2006 Proceedings}, year = {2006}, volume = {4128}, series = {LNCS}, pages = {198--210}, publisher = {Springer}, abstract = {We present a new method for determining placements of flexible reservation requests into a schedule. For each considered placement, the what-if method inserts a placeholder into the schedule and simulates the processing of batch jobs currently known to the system. Each placement is evaluated with respect to well-known scheduling metrics. This information may be used by a Grid reservation service to choose the placement of a reservation that is most likely to succeed. According to the results of extensive simulations, the what-if method grants more reservations and improves the performance of local jobs compared to our previously used load method.}, doi = {10.1007/11823285_21}, pdf = {papers/reservation_scheduling_europar2006.pdf}, keywords = {resource_management, scheduling, reservations} }
@phdthesis{Rzadca2008ResourceManagementModels, author = {Rzadca, K.}, title = {Resource Management Models and Algorithms for Multi-Organizational Grids}, school = {Institut National Polytechnique de Grenoble (INPG) and Polish-Japanese Institute of Information Technology (PJIIT)}, year = {2008}, pdf = {papers/phd.pdf}, keywords = {scheduling, resource_management}, owner = {krz}, timestamp = {2009.12.02} }
@inproceedings{Rzadca2007SchedulinginMultiOrganization, author = {K. Rzadca}, title = {Scheduling in Multi-Organization Grids: Measuring the Inefficiency of Decentralization}, booktitle = {PPAM 2007 Proceedings}, year = {2007}, volume = {4967}, series = {LNCS}, pages = {1048-1058}, publisher = {Springer}, abstract = {We present a novel, generic model of the grid that emphasises the roles of the individual organizations that form the system. The model allows us to study the global behaviour of the system without introducing external forms of recompense. Using game theory and equitable multicriteria optimization, we study three diverse types of computational grids: an off-line system with dedicated uniprocessors, an on-line system with divisible load and an off-line system with parallel jobs. Results show that, unless strong assumptions are made, complete decentralization leads to a significant loss of performance.}, doi = {10.1007/978-3-540-68111-3_111}, pdf = {papers/multi_org_scheduling-ppam2007.pdf}, keywords = {resource_management, scheduling, fairness, game theory, price of anarchy} }
@inproceedings{Rzadca2006EvaluatingQualityof, author = {K. Rzadca}, title = {Evaluating the Quality of Maximum Variance Cluster Algorithms}, booktitle = {ICCVG 2004 (International Conference on Computer Vision and Graphics) Proceedings}, year = {2006}, volume = {32}, series = {Computational Imaging and Vision}, pages = {981--986}, publisher = {Springer}, keywords = {data_clustering} }
@inproceedings{Rzadca2010ReplicaPlacementin, author = {Rzadca, Krzysztof and Datta, Anwitaman and Buchegger, Sonja}, title = {Replica Placement in P2P Storage: Complexity and Game Theoretic Analyses}, booktitle = {ICDCS 2010, The 30th International Conference on Distributed Computing Systems, Proceedings}, year = {2010}, pages = {599-609}, publisher = {IEEE Computer Society}, abstract = {In peer-to-peer storage systems, peers replicate each other's data in order to increase availability. If the matching is done centrally, the algorithm can optimize data availability in an equitable manner for all participants. However, if matching is decentralized, the peers' selfishness can greatly alter the results, leading to performance inequities that can render the system unreliable and thus ultimately unusable. We analyze the problem using both theoretical approaches (complexity analysis for the centralized system, game theory for the decentralized one) and simulation. We prove that the problem of optimizing availability in a centralized system is NP-hard. In decentralized settings, we show that the rational behavior of selfish peers will be to replicate only with similarly-available peers. Compared to the socially-optimal solution, highly available peers have their data availability increased at the expense of decreased data availability for less available peers. The price of anarchy is high: unbounded in one model, and linear in the number of time slots in the second model. We also propose centralized and decentralized heuristics that, according to our experiments, converge fast in the average case. The high price of anarchy means that a completely decentralized system could be too hostile for peers with low availability, who could never achieve satisfying replication parameters. Moreover, we experimentally show that even explicit consideration and exploitation of diurnal patterns of peer availability have a small effect on data availability---except when the system has truly global scope. Yet a fully centralized system is infeasible, not only because of problems in information gathering, but also because of the complexity of optimizing availability. The solution to this dilemma is to create system-wide cooperation rules that allow a decentralized algorithm, but also limit the selfishness of the participants.}, doi = {10.1109/ICDCS.2010.67}, pdf = {papers/serenity_icdcs.pdf}, keywords = {p2p, p2p_storage, game theory, price of anarchy}, owner = {krz}, timestamp = {2010.02.26} }
@inproceedings{Rzadca2003IncrementallyAssessingCluster, author = {K. Rzadca and F. Ferri}, title = {Incrementally Assessing Cluster Tendencies with a Maximum Variance Cluster Algorithm}, booktitle = {IbPRIA (Iberian Conference on Pattern Recognition and Image Analysis) Proceedings}, year = {2003}, volume = {2653}, series = {LNCS}, pages = {868--875}, publisher = {Springer}, pdf = {papers/imva.pdf}, keywords = {data_clustering} }
@inproceedings{Rzadca2005HeterogeneousMultiprocessorScheduling, author = {K. Rzadca and F. Seredynski}, title = {Heterogeneous Multiprocessor Scheduling with Differential Evolution}, booktitle = {IEEE CEC 2005 (Congress on Evolutionary Computation) Proceedings}, year = {2005}, pages = {2840--2847}, publisher = {IEEE Computer Society}, doi = {10.1109/CEC.2005.1555051}, pdf = {papers/scheduling-de.pdf}, keywords = {scheduling, resource_management} }
@article{Rzadca2010MulticastOverlay, author = {Rzadca, Krzysztof and Tan Teck Yong, Jackson and Datta, Anwitaman}, title = {Multi-Objective Optimization of Multicast Overlay for Collaborative Applications}, journal = {Computer Networks}, year = {2010}, volume = {54}, number = {12}, pages = {1986-2005}, publisher = {Elsevier}, abstract = {Current implementations of real-time collaborative applications rely on a dedicated infrastructure to carry out all synchronizing and communication functions, and require all end nodes to communicate directly with and through the central server. In this paper, we investigate scenarios in which the most resource-intensive functionality, the continuous communication among collaborators to disseminate changes, is decentralized, utilizing the end users as relays. We observe that the communication characteristics of real-time collaboration render existing multicast mechanisms unsuitable. As collaborative editing sessions are typically long, we are able to gather and then use additional parameters of nodes (their instabilities and frequency of sending updates) and communication links (latencies and average costs). We identify several criteria to determine the quality of a multicast tree: cost, latency and instability. We analyze the complexity of these problems and propose algorithms to optimize the communication topology. We also consider the multiobjective problem in which we search for a tree that results in a good trade-off between these measures. Validation of the algorithms on numerous graphs shows that it is important to consider the multiobjective problem, as optimal solutions for one performance measure can be far from the optimal values of the others. Finally, we briefly present an implementation of a communication library that uses the proposed algorithms to periodically adjust the dissemination tree.}, doi = {10.1016/j.comnet.2010.05.018}, pdf = {papers/momocomm-comnet.pdf}, keywords = {p2p, communication topology, collaborative applications, multi-objective optimization}, owner = {krz}, timestamp = {2010.08.29} }
@article{Rzadca2009PromotingCooperationin, author = {K. Rzadca and D. Trystram}, title = {Promoting Cooperation in Selfish Computational Grids}, journal = {European Journal of Operational Research}, year = {2009}, volume = {199}, number = {3}, pages = {647-657}, publisher = {Elsevier}, abstract = {In distributed computing, the recent paradigm shift from centrally-owned clusters to organizationally distributed computational grids introduces a number of new challenges in resource management and scheduling. In this work, we study the problem of Selfish Load Balancing, which extends the well-known Load Balancing (LB) problem to scenarios in which each processor is concerned only with the performance of its local jobs. We propose a simple mathematical model for such systems and a novel function for computing the cost of the execution of foreign jobs. Then, we use the game-theoretic framework to analyze the model in order to compute the expected result of LB performed in a grid formed by two clusters. We show that, firstly, LB is a socially-optimal strategy and, secondly, for similarly loaded clusters, it is sufficient to collaborate over longer time periods in order to make LB the dominant strategy for each cluster. However, we show that if we allow clusters to make decisions depending on their current queue length, LB will never be performed. Then, we propose an LB algorithm which balances the load more equitably, even in the presence of overloaded clusters. Our algorithms do not use any external forms of compensation (such as money). The load is balanced only by considering the parameters of the execution of jobs. This analysis is assessed experimentally by simulation, involving scenarios with multiple clusters and heterogeneous load.}, doi = {10.1016/j.ejor.2007.06.067}, pdf = {papers/promoting_cooperation-ejor.pdf}, keywords = {resource_management, scheduling, game theory} }
@inproceedings{Rzadca2006BriefAnnouncementPromoting, author = {K. Rzadca and D. Trystram}, title = {Brief Announcement: Promoting Cooperation in Selfish Grids}, booktitle = {SPAA 2006 (Annual ACM Symposium on Parallelism in Algorithms \& Architectures) Proceedings}, year = {2006}, pages = {332}, publisher = {ACM Press}, note = {Conference version of \cite{Rzadca2009PromotingCooperationin}}, keywords = {resource_management, scheduling} }
@inproceedings{Rzadca2007FairGameTheoreticResource, author = {K. Rzadca and D. Trystram and A. Wierzbicki}, title = {Fair Game-Theoretic Resource Management in Dedicated Grids}, booktitle = {IEEE International Symposium on Cluster Computing and the Grid (CCGRID 2007), Proceedings}, year = {2007}, publisher = {IEEE Computer Society}, abstract = {We study two problems directly resulting from the organizational decentralization of the Grid. The first is the problem of fair scheduling in systems in which the grid scheduler has complete control of the processors' schedules. The second is the problem of fair and feasible scheduling in the decentralized case, in which the grid scheduler can only suggest a schedule, which can be later modified by a processor's owner. Using game theory, we show that scheduling in the decentralized case is analogous to the Prisoner's Dilemma game. Moreover, the Nash equilibrium results in a significant performance drop. Therefore, strong community control is required to achieve acceptable performance.}, doi = {10.1109/CCGRID.2007.52}, pdf = {papers/fair_game_theoretic_resource-ccgrid2007.pdf}, keywords = {resource_management, scheduling, fairness, game theory, price of anarchy} }
@inproceedings{Rzadca2009MulticastTreesCollaborative, author = {K. Rzadca and J. Tan Teck Yong and A. Datta}, title = {Multicast Trees for Collaborative Applications}, booktitle = {9th IEEE/ACM International Symposium on Cluster Computing and the Grid, Proceedings}, year = {2009}, pages = {60-67}, publisher = {IEEE Computer Society}, abstract = {Current implementations of real-time collaborative applications rely on a dedicated infrastructure to carry out all synchronizing and communication functions, and require all end nodes to communicate directly with and through the central server. In this paper, we investigate an architecture in which the most resource-intensive functionality, the continuous communication among collaborators to disseminate changes, is decentralized, utilizing the end users as relays. We observe that the communication characteristics of real-time collaboration render existing multicast mechanisms unsuitable. As collaborative editing sessions are typically long, we are able to gather and then use additional parameters of nodes (their instabilities and frequency of sending updates) and communication links (latencies and average costs). We identify several criteria to determine the quality of a multicast tree: cost, latency and instability. We analyze the complexity of these problems and propose algorithms to optimize the communication topology. We also consider the multiobjective problem in which we search for a tree that results in a good trade-off between these measures. Validation of the algorithms on numerous graphs shows that it is important to consider the multiobjective problem, as optimal solutions for one performance measure can be far from the optimal values of the others.}, doi = {10.1109/CCGRID.2009.38}, pdf = {papers/multicast_trees-ccgrid09.pdf}, keywords = {p2p, communication topology, collaborative applications, multi-objective optimization}, owner = {krz}, timestamp = {2009.08.18} }
@incollection{Wierzbicki2009SupportingCollaborationand, author = {Wierzbicki, Adam and Datta, Anwitaman and Zaczek, Lukasz and Rzadca, Krzysztof}, title = {Supporting Collaboration and Creativity Through Mobile P2P Computing}, booktitle = {Handbook of Peer-to-Peer Networking}, publisher = {Springer}, year = {2010}, editor = {Shen, X. and Yu, H. and Buford, J. and Akon, M.}, pages = {1367-1400}, optaddress = {New York Dordrecht Heidelberg London}, doi = {10.1007/978-0-387-09751-0}, keywords = {p2p}, owner = {krz}, timestamp = {2009.08.18} }
@inproceedings{Wojtyla2006ArtificialImmuneSystems, author = {G. Wojtyla and K. Rzadca and F. Seredynski}, title = {Artificial Immune Systems Applied to Multiprocessor Scheduling}, booktitle = {PPAM 2005 Proceedings}, year = {2006}, volume = {3911}, series = {LNCS}, pages = {904--911}, publisher = {Springer}, keywords = {resource_management, scheduling} }
@inproceedings{Xin2009StereoTrustgroupbased, author = {Liu, Xin and A. Datta and K. Rzadca and Lim, Ee-Peng}, title = {StereoTrust: A Group-Based Personalized Trust Model}, booktitle = {CIKM 2009, The 18th ACM Conference on Information and Knowledge Management, Proceedings}, year = {2009}, pages = {7-16}, publisher = {ACM Press}, abstract = {Trust plays important roles in diverse decentralized environments, including our society at large. Computational trust models help, for instance, to guide users' judgements about other users on online auction sites, or to determine the quality of contributions in web 2.0 sites. Most of the existing trust models, however, require historical information about the past behavior of the specific agent being evaluated - information that is not always available. In contrast, in real-life interactions among users, in order to make a first guess about the trustworthiness of a stranger, we commonly use our "instinct" - essentially stereotypes developed from our past interactions with "similar" people. We propose StereoTrust, a computational trust model inspired by real-life stereotypes. A user forms stereotypes using her previous transactions with other agents. A stereotype contains certain features of agents and an expected outcome of the transaction. These features can be taken from agents' profile information, or agents' observed behavior in the system. When facing a stranger, the stereotypes matching the stranger's profile are aggregated to derive his expected trust. Additionally, when some information about the stranger's previous transactions is available, StereoTrust uses it to refine the stereotype matching. According to our experiments, StereoTrust compares favorably with existing trust models that use a different kind of information and more complete historical information. Moreover, because the evaluation is done according to the user's personal stereotypes, the system is completely distributed and the result obtained is personalized. StereoTrust can be used as a complementary mechanism to provide the initial trust value for a stranger, especially when there are no trusted, common third parties.}, doi = {10.1145/1645953.1645958}, pdf = {papers/stereotrust_cikm09.pdf}, keywords = {trust} }
This file was generated by bibtex2html 1.96.